From 69f3721a36d20e83f9282cc7ff8f9d8154a3a59c Mon Sep 17 00:00:00 2001
From: chezhang <1376507468@qq.com>
Date: Thu, 4 Sep 2025 14:55:53 +0800
Subject: [PATCH 01/58] [fix] fix failing tests when backend is maca

---
 .../batch_norm_kernel_register.cc             |   10 +-
 .../conv_transpose_grad_kernel_register.cu    |   40 -
 .../conv_transpose_grad_kernel_register.cu    | 1114 +++++++++++++++++
 .../impl/spectral_norm_grad_kernel_impl.h     |  130 --
 .../kernels/impl/spectral_norm_kernel_impl.h  |  182 ---
 backends/metax_gpu/kernels/metax_context.cc   |    1 +
 backends/metax_gpu/kernels/metax_context.h    |    1 +
 .../instance_norm_grad_kerne_registerl.cu     |  650 ++++++++++
 .../instance_norm_kernel_register.cu          |  253 ++++
 .../spectral_norm_grad_kernel_register.cu     |   22 +
 .../spectral_norm_kernel_register.cu          |   22 +
 backends/metax_gpu/patch/paddle.patch         |  462 +++++++
 12 files changed, 2534 insertions(+), 353 deletions(-)
 delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu
 create mode 100644 backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
 delete mode 100644 backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h
 delete mode 100644 backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h
 create mode 100644 backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu
 create mode 100644 backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu
 create mode 100644 backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu
 create mode 100644 backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc b/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc
index b12f208bec0..ac3d8b95062 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc
+++ b/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc
@@ -20,4 +20,12 @@ PD_CUSTOM_KERNEL_REGISTER(batch_norm_infer,
                           ALL_LAYOUT,
                           phi::BatchNormInferKernel,
                           float,
-                          phi::dtype::float16) {}
+                          double,
+                          phi::dtype::bfloat16,
+                          phi::dtype::float16) {
+  if (kernel_key.dtype() == phi::DataType::FLOAT16 ||
+      kernel_key.dtype() == phi::DataType::BFLOAT16) {
+    kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
+    kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
+  }
+}
diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu
deleted file mode 100644
index dacced51df4..00000000000
--- a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu"  // NOLINT
-PD_CUSTOM_KERNEL_REGISTER(conv2d_transpose_grad,
-                          metax_gpu,
-                          ALL_LAYOUT,
-                          phi::Conv2dTransposeGradKernel,
-                          float,
-                          double) {}
-PD_CUSTOM_KERNEL_REGISTER(conv2d_transpose_double_grad,
-                          metax_gpu,
-                          ALL_LAYOUT,
-                          phi::Conv2dTransposeDoubleGradKernel,
-                          float,
-                          double) {}
-PD_CUSTOM_KERNEL_REGISTER(conv3d_transpose_grad,
-                          metax_gpu,
-                          ALL_LAYOUT,
-                          phi::Conv3dTransposeGradKernel,
-                          float,
-                          double) {}
-PD_CUSTOM_KERNEL_REGISTER(depthwise_conv2d_transpose_grad,
-                          metax_gpu,
-                          ALL_LAYOUT,
-                          phi::DepthwiseConv2dTransposeGradKernel,
-                          float,
-                          double) {}
diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
new file mode 100644
index 00000000000..0067818d165
--- /dev/null
+++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
@@ -0,0 +1,1114 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+
+#include "kernels/gpudnn/conv_cudnn_v7.h"
+#include "kernels/metax_context.h"
+#include "paddle/common/ddim.h"
+#include "paddle/phi/backends/context_pool.h"
+#include "paddle/phi/backends/dynload/cudnn.h"
+#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h"
+#include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/padding.h"
+#include "paddle/phi/kernels/funcs/slice.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+
+using GPUDNNDataLayout = phi::backends::gpu::DataLayout;
+
+template <typename T, typename Context>
+void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx,
+                                      const DenseTensor& x,
+                                      const DenseTensor& filter,
+                                      const DenseTensor& dout,
+                                      const std::vector<int>& strides,
+                                      const std::vector<int>& paddings,
+                                      const std::string& padding_algorithm,
+                                      int groups,
+                                      const std::vector<int>& dilations,
+                                      const std::string& data_format,
+                                      DenseTensor* dx,
+                                      DenseTensor* dfilter) {
+  // 0-size
+  if (x.numel() == 0) {
+    if (dx) dev_ctx.template Alloc<T>(dx);
+    if (dfilter) {
+      phi::Full<T, Context>(dev_ctx,
+                            phi::IntArray(common::vectorize(dfilter->dims())),
+                            0,
+                            dfilter);
+    }
+    return;
+  }
+  if (filter.numel() == 0) {
+    if (dfilter) dev_ctx.template Alloc<T>(dfilter);
+    if (dx) {
+      phi::Full<T, Context>(
+          dev_ctx, phi::IntArray(common::vectorize(dx->dims())), 0, dx);
+    }
+    return;
+  }
+
+  const T* filter_data = filter.data<T>();
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> dilations_ =
+      dilations;  // cudnn v5 does not support dilations
+  const GPUDNNDataLayout data_layout =
+      (data_format != "NHWC"
? GPUDNNDataLayout::kNCHW + : GPUDNNDataLayout::kNHWC); + + // if channel_last, transpose to channel_first + DenseTensor x_transpose; + DenseTensor dout_transpose; + std::vector x_vec = common::vectorize(x.dims()); + std::vector out_vec = common::vectorize(dout.dims()); + if (data_layout == GPUDNNDataLayout::kNHWC) { + if (strides.size() == 2U) { + std::vector axis = {0, 3, 1, 2}; + for (size_t i = 0; i < axis.size(); ++i) { + x_vec[i] = x.dims()[axis[i]]; + out_vec[i] = dout.dims()[axis[i]]; + } + x_transpose = Transpose(dev_ctx, x, axis); + dout_transpose = Transpose(dev_ctx, dout, axis); + } else if (strides.size() == 3U) { + std::vector axis = {0, 4, 1, 2, 3}; + for (size_t i = 0; i < axis.size(); ++i) { + x_vec[i] = x.dims()[axis[i]]; + out_vec[i] = dout.dims()[axis[i]]; + } + x_transpose = Transpose(dev_ctx, x, axis); + dout_transpose = Transpose(dev_ctx, dout, axis); + } + } else { + x_transpose = x; + dout_transpose = dout; + } + + // update padding and dilation + auto x_dims = x_transpose.dims(); + auto filter_dims = filter.dims(); + DDim x_data_dims; + x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = common::vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim); + + std::vector x_pad(x_dims.size() * 2, 0); + DenseTensor transformed_dout; + std::vector padding_common(data_dim, 0); + if (!is_sys_pad) { + std::vector padding_diff(data_dim); + std::vector new_dout_shape_vec(data_dim + 2); + new_dout_shape_vec[0] = dout_transpose.dims()[0]; + new_dout_shape_vec[1] = dout_transpose.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]); + padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]); + new_dout_shape_vec[i + 2] = + dout_transpose.dims()[i + 2] + padding_diff[i]; + x_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i]; + x_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; + } + + transformed_dout.Resize(common::make_ddim(new_dout_shape_vec)); + dev_ctx.template Alloc(&transformed_dout); + + const int rank = x_transpose.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction( + dev_ctx, x_pad, dout_transpose, pad_value, &transformed_dout); + } break; + case 5: { + funcs::PadFunction( + dev_ctx, x_pad, dout_transpose, pad_value, &transformed_dout); + } break; + default: + PADDLE_THROW(errors::InvalidArgument( + "Op(ConvTranspose) only supports 4-D or 5-D x DenseTensor.")); + } + } else { + transformed_dout = dout_transpose; + if (paddings_.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[2 * i]; + } + } + } + + const T* x_data = x_transpose.data(); + const T* dout_data = transformed_dout.data(); + out_vec = common::vectorize(transformed_dout.dims()); + + // ------------------- cudnn descriptors --------------------- +#ifndef PADDLE_WITH_HIP + CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_dout); + CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(filter); + CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(x_transpose); +#endif + + GPUDNNDataLayout layout; + + if (strides.size() == 2U) { + layout = GPUDNNDataLayout::kNCHW; + } else { + layout = 
GPUDNNDataLayout::kNCDHW; + } + + int iwo_groups = groups; + int c_groups = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_groups = 1; + c_groups = groups; + groups = 1; +#endif + + auto dtype = phi::backends::gpu::CudnnDataType::type; + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + ConvArgs args1{handle, + &transformed_dout, + &filter, + &x_transpose, + strides, + padding_common, + dilations_, + dtype, + groups, + layout}; + ConvArgs args2{handle, + &transformed_dout, + &filter, + &x_transpose, + strides, + padding_common, + dilations_, + dtype, + groups, + layout}; + +#ifdef PADDLE_WITH_HIP + SearchResult fwd_result; + SearchResult filter_result; +#else + SearchResult fwd_result; + SearchResult filter_result; +#endif + + auto layout_tensor = phi::backends::gpu::GetCudnnTensorFormat(layout); + size_t workspace_size = 0; + bool deterministic = FLAGS_cudnn_deterministic; + T* dx_data = nullptr; + T* dfilter_data = nullptr; + + if (dx) { + dx_data = dev_ctx.template Alloc(dx); + + args1.idesc.set(transformed_dout, iwo_groups); + args1.wdesc.set(filter, layout_tensor, iwo_groups); + args1.odesc.set(x_transpose, iwo_groups); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search1 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); + fwd_result.algo = + search1::Find(args1, false, deterministic, workspace_size, dev_ctx); +#else + using search1 = SearchAlgorithm; + fwd_result = search1::Find(dev_ctx, args1, false, deterministic, false); + workspace_size = std::max( + workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); +#endif + } + + if (dfilter) { + dfilter_data = dev_ctx.template Alloc(dfilter); + + args2.idesc.set(transformed_dout, iwo_groups); + args2.wdesc.set(*dfilter, layout_tensor, iwo_groups); + args2.odesc.set(x_transpose, iwo_groups); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search2 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); + filter_result.algo = + search2::Find(args2, false, deterministic, workspace_size, dev_ctx); +#else + using search2 = SearchAlgorithm; + filter_result = + search2::Find(dev_ctx, args2, false, deterministic, false); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); +#endif + } + + // ------------------- cudnn conv backward data --------------------- + // FIxME(typhoonzero): template type T may not be the same as cudnn call. + int x_offset = x.numel() / x.dims()[0] / groups; + int dout_offset = + transformed_dout.numel() / transformed_dout.dims()[0] / groups; + int filter_offset = filter.numel() / groups; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + // auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = GetDnnWorkspace( + const_cast(&(dev_ctx.GetAllocator())), dev_ctx.stream()); + if (dx) { +#ifdef PADDLE_WITH_HIP + // Because beta is zero, it is unnecessary to reset dx. 
+ for (int g = 0; g < groups; g++) { + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenConvolutionForward(handle, + &alpha, + args1.idesc.desc(), + dout_data + dout_offset * g, + args1.wdesc.desc(), + filter_data + filter_offset * g, + args1.cdesc.desc(), + fwd_result.algo, + &beta, + args1.odesc.desc(), + dx_data + x_offset * g, + cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args1, + fwd_result, + dout_data, + filter_data, + dx_data, + groups, + dout_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + + if (data_layout == GPUDNNDataLayout::kNHWC) { + DenseTensor dx_transpose; + DenseTensor dx_nchw; + dx_nchw.ShareDataWith(*dx); + dx_nchw.Resize(common::make_ddim(x_vec)); + if (strides.size() == 2U) { + std::vector axis = {0, 2, 3, 1}; + dx_transpose = Transpose(dev_ctx, dx_nchw, axis); + *dx = dx_transpose; + } else if (strides.size() == 3U) { + std::vector axis = {0, 2, 3, 4, 1}; + dx_transpose = Transpose(dev_ctx, dx_nchw, axis); + *dx = dx_transpose; + } + } + } + + // ------------------- cudnn conv backward filter --------------------- + if (dfilter) { + // Because beta is zero, it is unnecessary to reset dfilter. + // Gradient with respect to the filter +#ifdef PADDLE_WITH_HIP + for (int g = 0; g < groups; g++) { + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args2.odesc.desc(), + x_data + x_offset * g, + args2.idesc.desc(), + dout_data + dout_offset * g, + args2.cdesc.desc(), + filter_result.algo, + &beta, + args2.wdesc.desc(), + dfilter_data + filter_offset * g, + cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args2, + filter_result, + x_data, + dout_data, + dfilter_data, + groups, + dout_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + } +} + +template +void Conv2dTransposeGradGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings_, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + ConvTransposeGradRawGPUDNNKernel(dev_ctx, + x, + filter, + dout, + strides, + paddings_, + padding_algorithm, + groups, + dilations_, + data_format, + dx, + dfilter); +} + +/* + * Inputs: I, filter, dout, ddI, ddfilter + * Outputs: ddout, dfilter, dI + * ddo = conv_bp_data(filter, ddI) + conv_bp_data(ddfilter, I) + * dfilter = conv_bp_filter(dout, ddI) + * dI = conv(dout, ddfilter) + */ +template +void Conv2dTransposeDoubleGradGPUDNNKernel( + const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const DenseTensor& ddx, + const DenseTensor& ddfilter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter, + DenseTensor* ddout) { + 
if (dx) { + dev_ctx.template Alloc(dx); + } + if (dfilter) { + dev_ctx.template Alloc(dfilter); + } + if (ddout) { + dev_ctx.template Alloc(ddout); + funcs::SetConstant set_zero; + set_zero(dev_ctx, ddout, static_cast(0)); + } + + const T* filter_ = filter.data(); + const T* dout_ = dout.data(); + const T* ddx_ = nullptr; + const T* ddfilter_ = nullptr; + T* dx_ = nullptr; + T* dfilter_ = nullptr; + T* ddout_ = nullptr; + T* transformed_dx_ = nullptr; + + std::vector paddings_ = paddings; + std::vector dilations_ = dilations; + + bool deterministic = FLAGS_cudnn_deterministic; + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform DenseTensors to channel first----------- + DenseTensor transformed_x_channel(x.type()); + DenseTensor transformed_dout_channel(dout.type()); + DenseTensor transformed_ddx_channel(x.type()); + + DenseTensor transformed_dx_channel(x.type()); + DenseTensor transformed_ddout_channel(dout.type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, &x, &transformed_x_channel); + TransToChannelFirst(dev_ctx, &x, &transformed_x_channel); + + ResizeToChannelFirst(dev_ctx, &dout, &transformed_dout_channel); + TransToChannelFirst(dev_ctx, &dout, &transformed_dout_channel); + + ResizeToChannelFirst(dev_ctx, &ddx, &transformed_ddx_channel); + TransToChannelFirst(dev_ctx, &ddx, &transformed_ddx_channel); + + if (dx) { + ResizeToChannelFirst(dev_ctx, dx, &transformed_dx_channel); + dev_ctx.template Alloc(&transformed_dx_channel); + } + if (ddout) { + ResizeToChannelFirst( + dev_ctx, ddout, &transformed_ddout_channel); + } + } else { + transformed_x_channel = x; + transformed_dout_channel = dout; + transformed_ddx_channel = ddx; + + if (dx) { + transformed_dx_channel = *dx; + } + } + std::vector out_vec = + common::vectorize(transformed_dout_channel.dims()); + + auto x_dims = transformed_x_channel.dims(); + auto filter_dims = filter.dims(); + DDim x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = common::vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim); + DenseTensor transformed_x(x.type()); + DenseTensor transformed_ddx(x.type()); + + DenseTensor transformed_dout(dout.type()); + + std::vector padding_common(data_dim, 0); + std::vector input_pad(x.dims().size() * 2, 0); + + if (!is_sys_pad) { + // get pad + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + std::vector new_output_grad_shape_vec(data_dim + 2); + + new_input_shape_vec[0] = transformed_x_channel.dims()[0]; + new_input_shape_vec[1] = transformed_x_channel.dims()[1]; + + new_output_grad_shape_vec[0] = transformed_dout_channel.dims()[0]; + new_output_grad_shape_vec[1] = transformed_dout_channel.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]); + padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]); + new_input_shape_vec[i + 2] = + transformed_x_channel.dims()[i + 2] + padding_diff[i]; + + new_output_grad_shape_vec[i + 2] = + transformed_dout_channel.dims()[i + 2] + padding_diff[i]; + + input_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; + } + DDim 
new_input_shape(common::make_ddim(new_input_shape_vec)); + transformed_x.Resize(new_input_shape); + transformed_ddx.Resize(new_input_shape); + transformed_dout.Resize(common::make_ddim(new_output_grad_shape_vec)); + + dev_ctx.template Alloc(&transformed_x); + dev_ctx.template Alloc(&transformed_ddx); + dev_ctx.template Alloc(&transformed_dout); + + // pad for input + const int rank = x.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction(dev_ctx, + input_pad, + transformed_x_channel, + pad_value, + &transformed_x); + funcs::PadFunction(dev_ctx, + input_pad, + transformed_dout_channel, + pad_value, + &transformed_dout); + funcs::PadFunction(dev_ctx, + input_pad, + transformed_ddx_channel, + pad_value, + &transformed_ddx); + } break; + case 5: { + funcs::PadFunction(dev_ctx, + input_pad, + transformed_x_channel, + pad_value, + &transformed_x); + funcs::PadFunction(dev_ctx, + input_pad, + transformed_ddx_channel, + pad_value, + &transformed_ddx); + } break; + default: + PADDLE_THROW(errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + } else { + transformed_x = transformed_x_channel; + transformed_dout = transformed_dout_channel; + transformed_ddx = transformed_ddx_channel; + + if (paddings_.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[2 * i]; + } + } + } + + std::vector starts(data_dim, 0); + std::vector ends(data_dim, 0); + std::vector axes(data_dim, 0); + for (size_t i = 0; i < data_dim; ++i) { + starts[i] = input_pad[2 * i + 4] * (strides[i] + 1); + ends[i] = starts[i] + out_vec[i + 2]; + axes[i] = i + 2; + } + + std::vector transformed_out_vec = out_vec; + for (size_t i = 0; i < data_dim; ++i) { + transformed_out_vec[i + 2] = + out_vec[i + 2] + + (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] - + 2 * padding_common[i] + paddings_[2 * i] + paddings_[2 * i + 1]; + } + + if (!is_sys_pad) { + transformed_ddout_channel.Resize(common::make_ddim(transformed_out_vec)); + dev_ctx.template Alloc(&transformed_ddout_channel); + } else { + dev_ctx.template Alloc(ddout); + transformed_ddout_channel = *ddout; + transformed_ddout_channel.Resize(common::make_ddim(transformed_out_vec)); + } + + const T* x_ = transformed_x.data(); + + int iwo_group = groups; + int c_group = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_group = 1; + c_group = groups; + groups = 1; +#endif + auto dtype = phi::backends::gpu::CudnnDataType::type; + + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + auto layout = + phi::backends::gpu::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); + + ConvArgs args1{handle, + &transformed_ddout_channel, + &filter, + &transformed_ddx, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + ConvArgs args2{handle, + &transformed_ddout_channel, + &ddfilter, + &transformed_x, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + + ConvArgs args3{handle, + &transformed_dout, + dfilter, + &transformed_ddx_channel, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + ConvArgs args4{handle, + &transformed_dout, + &ddfilter, + &transformed_dx_channel, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; +#ifdef PADDLE_WITH_HIP + SearchResult 
bwd_result1; + SearchResult bwd_result2; + SearchResult filter_result; + SearchResult fwd_result; +#else + SearchResult bwd_result1; + SearchResult bwd_result2; + SearchResult filter_result; + SearchResult fwd_result; +#endif + + // ddo = conv(ddI, filter) + conv(I, ddfilter) + size_t workspace_size = 0; + + T* transformed_ddout_channel_ = nullptr; + + if (ddout) { + ddout_ = ddout->data(); + transformed_ddout_channel_ = transformed_ddout_channel.data(); + + args1.idesc.set(transformed_ddout_channel, iwo_group); + args1.wdesc.set(filter, layout, iwo_group); + args1.odesc.set(transformed_ddx, iwo_group); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search1 = SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + bwd_result1.algo = + search1::Find(args1, false, deterministic, workspace_size, dev_ctx); +#else + using search1 = SearchAlgorithm; + bwd_result1 = search1::Find(dev_ctx, args1, false, deterministic, false); + workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); +#endif + + ddfilter_ = ddfilter.data(); + args2.handle = handle; + args2.idesc.set(transformed_ddout_channel, iwo_group); + args2.wdesc.set(ddfilter, layout, iwo_group); + args2.odesc.set(transformed_x, iwo_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search2 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); + bwd_result2.algo = + search2::Find(args2, false, deterministic, workspace_size, dev_ctx); +#else + using search2 = SearchAlgorithm; + bwd_result2 = search2::Find(dev_ctx, args2, false, deterministic, false); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); +#endif + } + + if (dfilter) { + dfilter_ = dfilter->data(); + + args3.idesc.set(transformed_dout, iwo_group); + args3.wdesc.set(*dfilter, layout, iwo_group); + args3.odesc.set(transformed_ddx_channel, iwo_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search3 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_result.algo = + search3::Find(args3, false, deterministic, workspace_size, dev_ctx); +#else + using search3 = SearchAlgorithm; + filter_result = + search3::Find(dev_ctx, args3, false, deterministic, false); + workspace_size = std::max( + workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); +#endif + } + + if (dx) { + transformed_dx_ = transformed_dx_channel.data(); + + args4.handle = handle; + args4.idesc.set(transformed_dout, iwo_group); + args4.wdesc.set(ddfilter, layout, iwo_group); + args4.odesc.set(transformed_dx_channel, iwo_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search4 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); + fwd_result.algo = + search4::Find(args4, false, deterministic, workspace_size, dev_ctx); +#else + using search4 = SearchAlgorithm; + fwd_result = search4::Find(dev_ctx, args4, false, deterministic, false); + workspace_size = std::max( + workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); +#endif + } + + int i_n, i_c, i_d, i_h, i_w; + GetNCDHW(transformed_x.dims(), + 
GPUDNNDataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + + int o_n, o_c, o_d, o_h, o_w; + GetNCDHW(transformed_dout.dims(), + GPUDNNDataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + + int group_offset_in = + transformed_x.numel() / transformed_x.dims()[0] / groups; + int group_offset_out = + transformed_dout.numel() / transformed_dout.dims()[0] / groups; + int group_offset_filter = filter.numel() / groups; + + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + + // auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = GetDnnWorkspace( + const_cast(&(dev_ctx.GetAllocator())), dev_ctx.stream()); + if (ddout) { + ddx_ = transformed_ddx.data(); +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + ddx_ + i * group_offset_in, + args1.wdesc.desc(), + filter_ + i * group_offset_filter, + args1.cdesc.desc(), + bwd_result1.algo, + &beta, + args1.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out, + workspace_ptr, + workspace_size)); + }, + workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args1, + bwd_result1, + ddx_, + filter_, + transformed_ddout_channel_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + // MIOPEN ONLY support beta to be 0.0f + DenseTensor conv_x_ddfilter(dout.type()); + conv_x_ddfilter.Resize(transformed_ddout_channel.dims()); + T* conv_x_ddfilter_data = dev_ctx.template Alloc(&conv_x_ddfilter); + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args2.odesc.desc(), + x_ + i * group_offset_in, + args2.wdesc.desc(), + ddfilter_ + i * group_offset_filter, + args2.cdesc.desc(), + bwd_result2.algo, + &beta, + args2.idesc.desc(), + conv_x_ddfilter_data + i * group_offset_out, + workspace_ptr, + workspace_size)); + }, + workspace_size); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenOpTensor( + handle, + miopenTensorOpAdd, + &alpha, + args2.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out, + &alpha, + args2.idesc.desc(), + conv_x_ddfilter_data + i * group_offset_out, + &beta, + args2.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out)); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args2, + bwd_result2, + x_, + ddfilter_, + transformed_ddout_channel_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + true); +#endif // PADDLE_WITH_HIP + + if ((!is_sys_pad) && (!channel_last)) { + if (strides.size() == 2U) { + funcs::Slice( + dev_ctx, &transformed_ddout_channel, ddout, starts, ends, axes); + } else if (!is_sys_pad && strides.size() == 3U) { + funcs::Slice( + dev_ctx, &transformed_ddout_channel, ddout, starts, ends, axes); + } + } else if ((!is_sys_pad) && (channel_last)) { + if (strides.size() == 2U) { + funcs::Slice(dev_ctx, + &transformed_ddout_channel, + &transformed_ddout_channel, + starts, + ends, + axes); + } else if (!is_sys_pad && strides.size() == 3U) { + funcs::Slice(dev_ctx, + &transformed_ddout_channel, + &transformed_ddout_channel, + starts, + ends, + axes); + } + + TransToChannelLast( + dev_ctx, &transformed_ddout_channel, ddout); 
+ } + } + + T* transformed_dout_channel_ = transformed_dout.data(); + if (dfilter) { + ddx_ = transformed_ddx_channel.data(); +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args3.odesc.desc(), + ddx_ + i * group_offset_in, + args3.idesc.desc(), + transformed_dout_channel_ + i * group_offset_out, + args3.cdesc.desc(), + filter_result.algo, + &beta, + args3.wdesc.desc(), + dfilter_ + i * group_offset_filter, + workspace_ptr, + workspace_size)); + }, + workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args3, + filter_result, + ddx_, + transformed_dout_channel_, + dfilter_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + } + + if (dx) { + ddfilter_ = ddfilter.data(); +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward( + handle, + &alpha, + args4.idesc.desc(), + transformed_dout_channel_ + i * group_offset_out, + args4.wdesc.desc(), + ddfilter_ + i * group_offset_filter, + args4.cdesc.desc(), + fwd_result.algo, + &beta, + args4.odesc.desc(), + transformed_dx_ + i * group_offset_in, + workspace_ptr, + workspace_size)); + }, + workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args4, + fwd_result, + transformed_dout_channel_, + ddfilter_, + transformed_dx_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_dx_channel, dx); + } + } +} + +template +void Conv3dTransposeGradGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings_, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + ConvTransposeGradRawGPUDNNKernel(dev_ctx, + x, + filter, + dout, + strides, + paddings_, + padding_algorithm, + groups, + dilations_, + data_format, + dx, + dfilter); +} + +} // namespace phi + +using float16 = phi::dtype::float16; + +PD_REGISTER_PLUGIN_KERNEL(conv2d_transpose_grad, + metax_gpu, + ALL_LAYOUT, + phi::Conv2dTransposeGradGPUDNNKernel, + float, + double, + float16, + phi::dtype::bfloat16) {} +PD_REGISTER_PLUGIN_KERNEL(conv2d_transpose_double_grad, + metax_gpu, + ALL_LAYOUT, + phi::Conv2dTransposeDoubleGradGPUDNNKernel, + float, + double, + float16, + phi::dtype::bfloat16) {} +PD_REGISTER_PLUGIN_KERNEL(conv3d_transpose_grad, + metax_gpu, + ALL_LAYOUT, + phi::Conv3dTransposeGradGPUDNNKernel, + float, + double, + float16, + phi::dtype::bfloat16) {} diff --git a/backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h deleted file mode 100644 index 03651be95c3..00000000000 --- a/backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "kernels/impl/spectral_norm_kernel_impl.h" - -namespace phi { - -template -void SpectralNormGradKernel(const Context& dev_ctx, - const DenseTensor& weight, - const DenseTensor& u, - const DenseTensor& v, - const DenseTensor& out_grad, - int dim, - int power_iters, - float eps, - DenseTensor* weight_grad) { - auto& place = *dev_ctx.eigen_device(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - const int h = u.dims()[0]; - const int w = v.dims()[0]; - - DenseTensor weight_mat, out_grad_mat; - auto dims = weight.dims(); - const int rank = dims.size(); - std::vector real_dims; - if (dim != 0) { - std::vector perm; - perm.push_back(dim); - real_dims.push_back(dims[dim]); - for (int i = 0; i < rank; i++) { - if (i != dim) { - perm.push_back(i); - real_dims.push_back(dims[i]); - } - } - weight_mat.Resize(common::make_ddim(real_dims)); - dev_ctx.template Alloc(&weight_mat); - out_grad_mat.Resize(common::make_ddim(real_dims)); - dev_ctx.template Alloc(&out_grad_mat); - TransCompute2DTo5D(dev_ctx, weight, rank, perm, &weight_mat); - TransCompute2DTo5D( - dev_ctx, out_grad, rank, perm, &out_grad_mat); - } else { - for (int i = 0; i < rank; i++) { - real_dims.push_back(i); - } - phi::Copy(dev_ctx, weight, dev_ctx.GetPlace(), true, &weight_mat); - phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), true, &out_grad_mat); - } - weight_mat = weight_mat.Resize({h, w}); - out_grad_mat = out_grad_mat.Resize({h, w}); - - DenseTensor sigma; - sigma.Resize(weight_mat.dims()); - dev_ctx.template Alloc(&sigma); - DenseTensor uu, vv; - phi::Copy(dev_ctx, u, dev_ctx.GetPlace(), true, &uu); - phi::Copy(dev_ctx, v, dev_ctx.GetPlace(), true, &vv); - CalcMatrixSigmaAndNormWeight(dev_ctx, - &weight_mat, - &(uu.Resize({h, 1})), - &(vv.Resize({w, 1})), - &sigma, - power_iters, - eps); - - DenseTensor uv; - uv.Resize({h, w}); - dev_ctx.template Alloc(&uv); - blas.MatMul( - uu.Resize({h, 1}), false, vv.Resize({w, 1}), false, T(1), &uv, T(0)); - - DenseTensor weight_grad_mat; - weight_grad_mat.Resize({h, w}); - dev_ctx.template Alloc(&weight_grad_mat); - auto weight_grad_mat_t = EigenTensor::From(weight_grad_mat); - auto weight_mat_t = EigenTensor::From(weight_mat); - auto out_grad_mat_t = EigenTensor::From(out_grad_mat); - auto sigma_t = EigenTensor::From(sigma); - auto uv_t = EigenTensor::From(uv); - weight_mat_t.device(place) = - weight_mat_t.sum().eval().reshape(Array2(1, 1)).broadcast(Array2(h, w)); - weight_grad_mat_t.device(place) = - out_grad_mat_t * (out_grad_mat_t.constant(1.0) - uv_t * weight_mat_t) / - sigma_t; - - if (dim != 0) { - std::vector perm; - for (int i = 0; i < rank; i++) { - if (i < dim) { - perm.push_back(i + 1); - } else if (i == dim) { - perm.push_back(0); - } else { - perm.push_back(i); - } - } - weight_grad->Resize(dims); - dev_ctx.template Alloc(weight_grad); - TransCompute2DTo5D( - dev_ctx, - weight_grad_mat.Resize(common::make_ddim(real_dims)), - rank, - perm, - weight_grad); - } else { - phi::Copy(dev_ctx, - 
weight_grad_mat.Resize(dims), - dev_ctx.GetPlace(), - true, - weight_grad); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h b/backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h deleted file mode 100644 index 8c9fc548259..00000000000 --- a/backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "kernels/funcs/blas/blas.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace phi { - -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; -using IndexPair = Eigen::IndexPair; - -template -static inline void TransCompute2DTo5D(const Context& dev_ctx, - const DenseTensor& in, - const int rank, - const std::vector& perm, - DenseTensor* out) { - if (rank <= 1 || rank > 5) { - PADDLE_THROW(common::errors::Fatal( - "Weight rank of SpectralNorm should be in range [2, 5], but got %d.", - rank)); - } - - switch (rank) { - case 2: - phi::funcs::Transpose trans2; - trans2(dev_ctx, in, out, perm); - break; - case 3: - phi::funcs::Transpose trans3; - trans3(dev_ctx, in, out, perm); - break; - case 4: - phi::funcs::Transpose trans4; - trans4(dev_ctx, in, out, perm); - break; - case 5: - phi::funcs::Transpose trans5; - trans5(dev_ctx, in, out, perm); - break; - default: - break; - } -} - -template -static inline void CalcMatrixSigmaAndNormWeight(const Context& dev_ctx, - DenseTensor* weight, - DenseTensor* u, - DenseTensor* v, - DenseTensor* sigma, - const int power_iters, - const float eps) { - auto& place = *dev_ctx.eigen_device(); - auto blas = funcs::GetBlas(dev_ctx); - auto sigma_t = EigenTensor::From(*sigma); - auto weight_t = EigenTensor::From(*weight); - auto u_t = EigenTensor::From(*u); - auto v_t = EigenTensor::From(*v); - - const int h = weight->dims()[0]; - const int w = weight->dims()[1]; - - for (int i = 0; i < power_iters; i++) { - // V = W^T * U / ||W^T * U||_2 - blas.MatMul(*weight, true, *u, false, T(1), v, T(0)); - auto v_t_norm = - v_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast( - Array1(w)); - v_t.device(place) = v_t / (v_t_norm + v_t_norm.constant(eps)); - // U = W^T * V / ||W^T * V||_2 - blas.MatMul(*weight, false, *v, false, T(1), u, T(0)); - auto u_t_norm = - u_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast( - Array1(h)); - u_t.device(place) = u_t / (u_t_norm + u_t_norm.constant(eps)); - } - DenseTensor weight_v; - weight_v.Resize({h, 1}); - dev_ctx.template Alloc(&weight_v); - blas.MatMul(*weight, false, *v, false, T(1), &weight_v, T(0)); - auto weight_v_t = EigenTensor::From(weight_v); - sigma_t.device(place) = (u_t * weight_v_t) - .sum() - .eval() - .reshape(Array2(1, 1)) - .broadcast(Array2(h, w)); - weight_t.device(place) = weight_t / sigma_t; -} - -template -void SpectralNormKernel(const 
Context& dev_ctx, - const DenseTensor& weight, - const DenseTensor& u, - const DenseTensor& v, - int dim, - int power_iters, - float eps, - DenseTensor* out) { - const int h = u.dims()[0]; - const int w = v.dims()[0]; - - DenseTensor weight_mat; - auto dims = weight.dims(); - const int rank = dims.size(); - std::vector real_dims; - if (dim != 0) { - std::vector perm; - perm.push_back(dim); - real_dims.push_back(dims[dim]); - for (int i = 0; i < rank; i++) { - if (i != dim) { - perm.push_back(i); - real_dims.push_back(dims[i]); - } - } - weight_mat.Resize(common::make_ddim(real_dims)); - dev_ctx.template Alloc(&weight_mat); - TransCompute2DTo5D(dev_ctx, weight, rank, perm, &weight_mat); - } else { - for (int i = 0; i < rank; i++) { - real_dims.push_back(i); - } - phi::Copy(dev_ctx, weight, dev_ctx.GetPlace(), true, &weight_mat); - } - weight_mat = weight_mat.Resize({h, w}); - - DenseTensor sigma; - sigma.Resize(weight_mat.dims()); - dev_ctx.template Alloc(&sigma); - DenseTensor uu, vv; - phi::Copy(dev_ctx, u, dev_ctx.GetPlace(), true, &uu); - phi::Copy(dev_ctx, v, dev_ctx.GetPlace(), true, &vv); - CalcMatrixSigmaAndNormWeight(dev_ctx, - &weight_mat, - &(uu.Resize({h, 1})), - &(vv.Resize({w, 1})), - &sigma, - power_iters, - eps); - - if (dim != 0) { - std::vector perm; - for (int i = 0; i < rank; i++) { - if (i < dim) { - perm.push_back(i + 1); - } else if (i == dim) { - perm.push_back(0); - } else { - perm.push_back(i); - } - } - out->Resize(dims); - dev_ctx.template Alloc(out); - TransCompute2DTo5D( - dev_ctx, - weight_mat.Resize(common::make_ddim(real_dims)), - rank, - perm, - out); - } else { - phi::Copy(dev_ctx, weight_mat.Resize(dims), dev_ctx.GetPlace(), true, out); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_context.cc index 9bd26a170c5..4df4d88b0b4 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_context.cc @@ -15,6 +15,7 @@ #include "kernels/metax_context.h" namespace phi { +bool AllowTF32Cudnn() { return false; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_context.h index 21e9084a977..5974aadcc41 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_context.h @@ -128,6 +128,7 @@ inline void InitCusolverDnHandle(cusolverDnHandle_t* handle, } } +bool AllowTF32Cudnn(); inline cusolverDnHandle_t GetCusolverDnHandle(gpuStream_t stream, Place place) { std::call_once(flag_cusolver_dn_, [&]() { if (!cusolver_dn_handle_) { diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu new file mode 100644 index 00000000000..d7540d949a9 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu @@ -0,0 +1,650 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "glog/logging.h"
+#include "kernels/metax_context.h"
+#include "paddle/common/layout.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_utils.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/norm_utils.h"
+#include "paddle/phi/kernels/gpu/instance_norm_utils.h"
+#include "paddle/phi/kernels/instance_norm_grad_kernel.h"
+
+namespace phi {
+template <typename T, int BlockDim>
+static __global__ void GradComputeDX(const T *dy,
+                                     const BatchNormParamType<T> *scale,
+                                     const BatchNormParamType<T> *mean,
+                                     const T *x,
+                                     const BatchNormParamType<T> *variance,
+                                     const int C,
+                                     const int sample_size,
+                                     T *dx) {
+  int beg_idx = blockIdx.x * sample_size + threadIdx.x;
+  int end_idx = (blockIdx.x + 1) * sample_size;
+  int ncid = blockIdx.x;
+  int c = ncid % C;
+  BatchNormParamType<T> mean_val = mean[ncid];
+  BatchNormParamType<T> inv_var_val = variance[ncid];
+  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage dy_storage;
+  __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
+  __shared__ BatchNormParamType<T> dy_sum_val;
+  __shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
+  BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
+  BatchNormParamType<T> dy_x_sub_mean_sum =
+      static_cast<BatchNormParamType<T>>(0);
+
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    BatchNormParamType<T> dy_i = static_cast<BatchNormParamType<T>>(dy[i]);
+    dy_sum += dy_i;
+    dy_x_sub_mean_sum +=
+        dy_i * (static_cast<BatchNormParamType<T>>(x[i]) - mean_val);
+  }
+  dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
+  dy_x_sub_mean_sum =
+      BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum());
+  if (threadIdx.x == 0) {
+    dy_sum_val = dy_sum;
+    dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
+  }
+  __syncthreads();
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    dx[i] = static_cast<T>(
+        (static_cast<BatchNormParamType<T>>(dy[i]) -
+         dy_sum_val / static_cast<BatchNormParamType<T>>(sample_size) -
+         (static_cast<BatchNormParamType<T>>(x[i]) - mean_val) *
+             dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) *
+        scale[c] * inv_var_val);
+  }
+}
+
+static __device__ __forceinline__ float real_sqrt(float x) {
+  return 1. / sqrtf(x);
+}
+static __device__ __forceinline__ double real_sqrt(double x) {
+  return 1. / sqrt(x);
+}
+
+template <typename T, typename AccT, int BlockDim>
+__global__ void DoubleGradComputeDX(const T *x,
+                                    const AccT *mean,
+                                    const AccT *variance,
+                                    const T *ddx,
+                                    const T *dy,
+                                    const AccT *scale,
+                                    const AccT *ddscale,
+                                    int C,
+                                    int sample_size,
+                                    const double epsilon,
+                                    T *dx) {
+  int beg_idx = blockIdx.x * sample_size + threadIdx.x;
+  int end_idx = (blockIdx.x + 1) * sample_size;
+  int ncid = blockIdx.x;
+  int c = ncid % C;
+
+  AccT mean_val = mean[ncid];
+  AccT var_val = variance[ncid];
+
+  typedef cub::BlockReduce<AccT, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage dy_storage;
+  __shared__ typename BlockReduce::TempStorage ddx_storage;
+  __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage;
+  __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage;
+  __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage;
+  __shared__ AccT dy_sum_val;
+  __shared__ AccT ddx_sum_val;
+  __shared__ AccT dy_mul_ddx_sum_val;
+  __shared__ AccT dy_mul_x_sub_mean_sum_val;
+  __shared__ AccT ddx_mul_x_sub_mean_sum_val;
+
+  AccT dy_sum = 0;
+  AccT ddx_sum = 0;
+  AccT dy_mul_ddx_sum = 0;
+  AccT dy_mul_x_sub_mean_sum = 0;
+  AccT ddx_mul_x_sub_mean_sum = 0;
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    AccT ddx_i = static_cast<AccT>(ddx[i]);
+    AccT dy_i = static_cast<AccT>(dy[i]);
+    AccT tmp = static_cast<AccT>(x[i]) - mean_val;
+
+    dy_sum += dy_i;
+    ddx_sum += ddx_i;
+    dy_mul_ddx_sum += (ddx_i * dy_i);
+
+    dy_mul_x_sub_mean_sum += (dy_i * tmp);
+    ddx_mul_x_sub_mean_sum += (ddx_i * tmp);
+  }
+
+  dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
+  ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum());
+  dy_mul_ddx_sum =
+      BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum());
+  dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage)
+                              .Reduce(dy_mul_x_sub_mean_sum, cub::Sum());
+  ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage)
+                               .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum());
+
+  if (threadIdx.x == 0) {
+    dy_sum_val = dy_sum;
+    ddx_sum_val = ddx_sum;
+    dy_mul_ddx_sum_val = dy_mul_ddx_sum;
+    dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum;
+    ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum;
+  }
+  __syncthreads();
+
+  if (ddx != nullptr) {
+    for (int i = beg_idx; i < end_idx; i += BlockDim) {
+      AccT tmp = static_cast<AccT>(dx[i]);
+      tmp +=
+          ((static_cast<AccT>(x[i]) - mean_val) * var_val * var_val * var_val /
+               sample_size *
+               (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val +
+                3. 
* dy_mul_x_sub_mean_sum_val * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + + ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (dy_sum_val / sample_size - static_cast(dy[i])) + + dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * + (ddx_sum_val / sample_size - static_cast(ddx[i]))) * + scale[c]; + dx[i] = static_cast(tmp); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(dx[i]); + tmp += (static_cast(dy[i]) * var_val - + dy_sum_val / sample_size * var_val - + (static_cast(x[i]) - mean_val) * var_val * + dy_mul_x_sub_mean_sum_val * var_val / sample_size) * + ddscale[c]; + dx[i] = static_cast(tmp); + } + } +} + +template +__global__ void DoubleGradComputeDDY(const T *x, + const AccT *mean, + const AccT *variance, + const AccT *ddscale, + const AccT *ddbias, + const T *ddx, + const AccT *scale, + int C, + int sample_size, + const double epsilon, + T *ddy) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + AccT mean_val = mean[ncid]; + AccT var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ AccT ddx_sum_val; + __shared__ AccT ddx_mul_x_sub_mean_sum_val; + + AccT ddx_sum = 0; + AccT ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT ddx_i = static_cast(ddx[i]); + ddx_sum += ddx_i; + ddx_mul_x_sub_mean_sum += (ddx_i * (static_cast(x[i]) - mean_val)); + } + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + if (threadIdx.x == 0) { + ddx_sum_val = ddx_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(ddy[i]); + tmp += scale[c] * var_val * + (static_cast(ddx[i]) - ddx_sum_val / sample_size - + (static_cast(x[i]) - mean_val) * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size); + ddy[i] = static_cast(tmp); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(ddy[i]); + tmp += (static_cast(x[i]) - mean_val) * var_val * ddscale[c]; + ddy[i] = static_cast(tmp); + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] = static_cast(static_cast(ddy[i]) + ddbias[c]); + } + } +} + +template +__global__ void DoubleGradComputeDScale(const T *x, + const AccT *mean, + const AccT *variance, + const T *ddx, + const T *dy, + int C, + int sample_size, + const double epsilon, + AccT *dscale) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + AccT mean_val = mean[ncid]; + AccT var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; + __shared__ AccT dy_sum_val; + __shared__ AccT dy_mul_x_sub_mean_sum_val; + + AccT dy_sum = 0; + AccT 
dy_mul_x_sub_mean_sum = 0;
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    AccT dy_i = static_cast<AccT>(dy[i]);
+    dy_sum += dy_i;
+    dy_mul_x_sub_mean_sum += (dy_i * (static_cast<AccT>(x[i]) - mean_val));
+  }
+  dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
+  dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage)
+                              .Reduce(dy_mul_x_sub_mean_sum, cub::Sum());
+
+  if (threadIdx.x == 0) {
+    dy_sum_val = dy_sum;
+    dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum;
+  }
+  __syncthreads();
+  if (ddx != nullptr) {
+    AccT dscale_tmp = 0;
+    for (int i = beg_idx; i < end_idx; i += BlockDim) {
+      dscale_tmp +=
+          static_cast<AccT>(ddx[i]) * var_val *
+          (static_cast<AccT>(dy[i]) - dy_sum_val / sample_size -
+           dy_mul_x_sub_mean_sum_val * (static_cast<AccT>(x[i]) - mean_val) *
+               var_val * var_val / sample_size);
+    }
+    dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum());
+    if (threadIdx.x == 0) {
+      dscale[ncid] += dscale_tmp;
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T, typename Context>
+void InstanceNormGradKernel(const Context &dev_ctx,
+                            const DenseTensor &x,
+                            const paddle::optional<DenseTensor> &scale,
+                            const paddle::optional<DenseTensor> &bias UNUSED,
+                            const DenseTensor &saved_mean,
+                            const DenseTensor &saved_variance,
+                            const DenseTensor &d_y,
+                            float epsilon_f,
+                            DenseTensor *d_x,
+                            DenseTensor *d_scale,
+                            DenseTensor *d_bias) {
+  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  double epsilon = static_cast<double>(epsilon_f);
+  const auto *scale_ptr = scale.get_ptr();
+
+  const auto &x_dims = x.dims();
+
+  int N, C, H, W, D;
+  funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
+  int NxC = N * C;
+
+  DenseTensor x_tmp, d_y_tmp;
+  x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D});
+  d_y_tmp.ShareDataWith(d_y).Resize({1, NxC, H, W, D});
+
+  phi::funcs::SetConstant<Context, AccT> set_constant;
+
+  dev_ctx.template Alloc<T>(d_x);
+  if (x.numel() == 0) {
+    if (d_scale) {
+      dev_ctx.template Alloc<AccT>(d_scale);
+      set_constant(dev_ctx, d_scale, static_cast<AccT>(0));
+    }
+    if (d_bias) {
+      dev_ctx.template Alloc<AccT>(d_bias);
+      set_constant(dev_ctx, d_bias, static_cast<AccT>(0));
+    }
+    return;
+  }
+  if (d_scale && d_bias) {
+    dev_ctx.template Alloc<AccT>(d_scale);
+    dev_ctx.template Alloc<AccT>(d_bias);
+  }
+
+  if (scale_ptr) {
+    PADDLE_ENFORCE_EQ(
+        scale_ptr->dims().size(),
+        1UL,
+        common::errors::InvalidArgument(
+            "The `shape` in InstanceNormOp is invalid: "
+            "the size of scale's dimensions must be equal to 1. But "
+            "received: the size of scale's dimensions "
+            "is [%d]",
+            scale_ptr->dims().size()));
+    PADDLE_ENFORCE_EQ(scale_ptr->dims()[0],
+                      C,
+                      common::errors::InvalidArgument(
+                          "The `shape` in InstanceNormOp is invalid: "
+                          "the first dimension of scale must be equal to "
+                          "Channels([%d]). But received: "
+                          "the first dimension of scale is [%d], "
+                          "the dimensions of scale are [%s].",
+                          C,
+                          scale_ptr->dims()[0],
+                          scale_ptr->dims()));
+  }
+
+  const int n = x.numel();
+  const int block = 512;
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(max_threads / block, 1);
+  const int grid = std::min(NxC, max_blocks);
+  const int grid1 = (C + block - 1) / block;
+
+  DenseTensor scale_tmp;
+  scale_tmp.Resize({NxC});
+  dev_ctx.template Alloc<AccT>(&scale_tmp);
+
+  DenseTensor d_scale_tmp;
+  d_scale_tmp.Resize({NxC});
+  dev_ctx.template Alloc<AccT>(&d_scale_tmp);
+
+  DenseTensor d_bias_tmp;
+  d_bias_tmp.Resize({NxC});
+  dev_ctx.template Alloc<AccT>(&d_bias_tmp);
+  if (scale_ptr) {
+    repeat_param<AccT><<<grid, block, 0, dev_ctx.stream()>>>(
+        scale_ptr->data<AccT>(), scale_tmp.data<AccT>(), N, C);
+  } else {
+    set_constant(dev_ctx, &scale_tmp, static_cast<AccT>(1));
+  }
+  std::vector<int> dims;
+  std::vector<int> strides;
+  dims = {1, NxC, H, W, D};
+  strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
+
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t data_desc_;
+  miopenTensorDescriptor_t in_param_desc_;
+
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
+#else
+  cudnnTensorDescriptor_t data_desc_;
+  cudnnTensorDescriptor_t in_param_desc_;
+
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
+#endif
+
+  if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+    LOG(ERROR) << "Provided epsilon is smaller than "
+               << "CUDNN_BN_MIN_EPSILON. Setting it to "
+               << "CUDNN_BN_MIN_EPSILON instead.";
+  }
+  epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
+      data_desc_,
+      CudnnDataType<T>::type,
+      x_dims.size() > 3 ? x_dims.size() : 4,
+      const_cast<int *>(dims.data()),
+      const_cast<int *>(strides.data())));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
+      in_param_desc_, data_desc_, miopenBNSpatial));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
+      data_desc_,
+      CudnnDataType<T>::type,
+      x_dims.size() > 3 ? x_dims.size() : 4,
+      dims.data(),
+      strides.data()));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
+      in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
+#endif
+  const auto *saved_mean_data =
+      saved_mean.template data<BatchNormParamType<T>>();
+  const auto *saved_var_data =
+      saved_variance.template data<BatchNormParamType<T>>();
+
+  if (d_scale && d_bias) {
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenBatchNormalizationBackward(
+        GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()),
+        miopenBNSpatial,
+        CudnnDataType<T>::kOne(),
+        CudnnDataType<T>::kZero(),
+        CudnnDataType<T>::kOne(),
+        CudnnDataType<T>::kZero(),
+        data_desc_,
+        x_tmp.template data<T>(),
+        data_desc_,
+        d_y_tmp.template data<T>(),
+        data_desc_,
+        d_x->template data<T>(),
+        in_param_desc_,
+        scale_tmp.template data<BatchNormParamType<T>>(),
+        d_scale_tmp.template data<BatchNormParamType<T>>(),
+        d_bias_tmp.template data<BatchNormParamType<T>>(),
+        epsilon,
+        saved_mean_data,
+        saved_var_data));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackward(
+        GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()),
+        CUDNN_BATCHNORM_SPATIAL,
+        CudnnDataType<T>::kOne(),
+        CudnnDataType<T>::kZero(),
+        CudnnDataType<T>::kOne(),
+        CudnnDataType<T>::kZero(),
+        data_desc_,
+        x_tmp.template data<T>(),
+        data_desc_,
+        d_y_tmp.template data<T>(),
+        data_desc_,
+        d_x->template data<T>(),
+        in_param_desc_,
+        scale_tmp.template data<BatchNormParamType<T>>(),
+        d_scale_tmp.template data<BatchNormParamType<T>>(),
+        d_bias_tmp.template data<BatchNormParamType<T>>(),
+        epsilon,
+        saved_mean_data,
+        saved_var_data));
+#endif
+  } else {
+    if (d_x) {
+      GradComputeDX<T, block><<<NxC, block, 0, dev_ctx.stream()>>>(
+          d_y.data<T>(),
+          scale_tmp.data<BatchNormParamType<T>>(),
+          saved_mean_data,
+          x.data<T>(),
+          saved_var_data,
+          C,
+          H * W * D,
+          d_x->data<T>());
+    }
+  }
+  if (d_scale && d_bias) {
+    add_param<AccT, block><<<grid1, block, 0, dev_ctx.stream()>>>(
+        d_scale_tmp.data<AccT>(), d_scale->data<AccT>(), N, C);
+    add_param<AccT, block><<<grid1, block, 0, dev_ctx.stream()>>>(
+        d_bias_tmp.data<AccT>(), d_bias->data<AccT>(), N, C);
+  }
+
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
+#endif
+}
+
+template <typename T, typename Context>
+void InstanceNormDoubleGradKernel(const Context &dev_ctx,
+                                  const DenseTensor &x,
+                                  const paddle::optional<DenseTensor> &scale,
+                                  const DenseTensor &saved_mean,
+                                  const DenseTensor &saved_variance,
+                                  const DenseTensor &dy,
+                                  const paddle::optional<DenseTensor> &ddx,
+                                  const paddle::optional<DenseTensor> &ddscale,
+                                  const paddle::optional<DenseTensor> &ddbias,
+                                  float epsilon_f,
+                                  DenseTensor *dx,
+                                  DenseTensor *dscale,
+                                  DenseTensor *ddy) {
+  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  const auto *Scale = scale.get_ptr();
+  const auto *ddX = ddx.get_ptr();
+  const auto *ddScale = ddscale.get_ptr();
+  const auto *ddBias = ddbias.get_ptr();
+  const double epsilon = static_cast<double>(epsilon_f);
+  const T *x_data = x.data<T>();
+  const T *dy_data = dy.data<T>();
+  const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data<T>());
+  const AccT *ddscale_data =
+      (ddScale == nullptr ? nullptr : ddScale->data<AccT>());
+  const AccT *ddbias_data =
+      (ddBias == nullptr ? nullptr : ddBias->data<AccT>());
+  const AccT *mean_data = saved_mean.data<AccT>();
+  const AccT *variance_data = saved_variance.data<AccT>();
+  phi::funcs::SetConstant<Context, T> set_zero;
+  phi::funcs::SetConstant<Context, AccT> set_zero_AccT;
+
+  auto &x_dims = x.dims();
+  int N, C, H, W, D;
+  funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
+  int NxC = N * C;
+  const int n = x.numel();
+  int sample_size = n / N / C;
+
+  DenseTensor scale_tmp;
+  if (!Scale) {
+    scale_tmp.Resize({C});
+    dev_ctx.template Alloc<AccT>(&scale_tmp);
+    set_zero_AccT(dev_ctx, &scale_tmp, static_cast<AccT>(1));
+  }
+  const AccT *scale_data = Scale ? Scale->data<AccT>() : scale_tmp.data<AccT>();
+  const int block = 512;
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(max_threads / block, 1);
+  const int grid = NxC;
+  const int grid1 = (C + block - 1) / block;
+
+  if (dx) {
+    T *dx_data = dev_ctx.template Alloc<T>(dx);
+    set_zero(dev_ctx, dx, static_cast<T>(0));
+    DoubleGradComputeDX<T, AccT, block>
+        <<<grid, block, 0, dev_ctx.stream()>>>(x_data,
+                                               mean_data,
+                                               variance_data,
+                                               ddx_data,
+                                               dy_data,
+                                               scale_data,
+                                               ddscale_data,
+                                               C,
+                                               sample_size,
+                                               epsilon,
+                                               dx_data);
+  }
+  if (dscale) {
+    DenseTensor dscale_tmp;
+    dscale_tmp.Resize({NxC});
+    dev_ctx.template Alloc<AccT>(&dscale_tmp);
+    set_zero_AccT(dev_ctx, &dscale_tmp, static_cast<AccT>(0));
+    AccT *dscale_tmp_data = dscale_tmp.data<AccT>();
+
+    AccT *dscale_data = dev_ctx.template Alloc<AccT>(dscale);
+    set_zero_AccT(dev_ctx, dscale, static_cast<AccT>(0));
+    DoubleGradComputeDScale<T, AccT, block>
+        <<<grid, block, 0, dev_ctx.stream()>>>(x_data,
+                                               mean_data,
+                                               variance_data,
+                                               ddx_data,
+                                               dy_data,
+                                               C,
+                                               sample_size,
+                                               epsilon,
+                                               dscale_tmp_data);
+    add_param<AccT, block><<<grid1, block, 0, dev_ctx.stream()>>>(
+        dscale_tmp.data<AccT>(), dscale->data<AccT>(), N, C);
+  }
+  if (ddy) {
+    T *ddy_data = dev_ctx.template Alloc<T>(ddy);
+    set_zero(dev_ctx, ddy, static_cast<T>(0));
+    DoubleGradComputeDDY<T, AccT, block>
+        <<<grid, block, 0, dev_ctx.stream()>>>(x_data,
+                                               mean_data,
+                                               variance_data,
+                                               ddscale_data,
+                                               ddbias_data,
+                                               ddx_data,
+                                               scale_data,
+                                               C,
+                                               sample_size,
+                                               epsilon,
+                                               ddy_data);
+  }
+}
+}  // namespace phi
+
+PD_REGISTER_PLUGIN_KERNEL(instance_norm_grad,
+                          metax_gpu,
+                          ALL_LAYOUT,
+                          phi::InstanceNormGradKernel,
+                          float,
+                          double,
+                          phi::dtype::float16,
+                          phi::dtype::bfloat16) {}
+PD_REGISTER_PLUGIN_KERNEL(instance_norm_double_grad,
+                          metax_gpu,
+                          ALL_LAYOUT,
+                          phi::InstanceNormDoubleGradKernel,
+                          float,
+                          double,
+                          phi::dtype::float16,
+                          phi::dtype::bfloat16) {}
diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu
new file mode 100644
index 00000000000..db975d74665
--- /dev/null
+++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu
@@ -0,0 +1,253 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "glog/logging.h"
+#include "kernels/metax_context.h"
+#include "paddle/common/layout.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/norm_utils.h"
+#include "paddle/phi/kernels/gpu/instance_norm_utils.h"
+#include "paddle/phi/kernels/instance_norm_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void InstanceNormKernel(const Context &dev_ctx,
+                        const DenseTensor &x,
+                        const paddle::optional<DenseTensor> &scale,
+                        const paddle::optional<DenseTensor> &bias,
+                        float epsilon_f,
+                        DenseTensor *y,
+                        DenseTensor *saved_mean,
+                        DenseTensor *saved_variance) {
+  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  double epsilon = static_cast<double>(epsilon_f);
+  auto &x_dims = x.dims();
+  PADDLE_ENFORCE_GE(x_dims.size(),
+                    2,
+                    common::errors::InvalidArgument(
+                        "The `shape` in InstanceNormOp is invalid: "
+                        "the size of X's dimensions must be greater than "
+                        "or equal to 2. But received: "
+                        "the size of X's dimensions is [%d]",
+                        x_dims.size()));
+  PADDLE_ENFORCE_LE(x_dims.size(),
+                    5,
+                    common::errors::InvalidArgument(
+                        "The `shape` in InstanceNormOp is invalid: "
+                        "the size of X's dimensions must be smaller than "
+                        "or equal to 5. But received: "
+                        "the size of X's dimensions is [%d]",
+                        x_dims.size()));
+  int N, C, H, W, D;
+  funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
+  int NxC = N * C;
+  DenseTensor x_tmp;
+  x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D});
+  dev_ctx.template Alloc<T>(y);
+  phi::funcs::SetConstant<Context, BatchNormParamType<T>> functor;
+  phi::funcs::SetConstant<Context, T> functor_y;
+  if (x.numel() == 0) {
+    functor_y(dev_ctx, y, static_cast<T>(0));
+    if (saved_mean) {
+      dev_ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
+      functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
+    }
+    if (saved_variance) {
+      dev_ctx.template Alloc<BatchNormParamType<T>>(saved_variance);
+      functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
+    }
+    return;
+  }
+
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t data_desc_;
+  miopenTensorDescriptor_t in_param_desc_;
+
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
+#else
+  cudnnTensorDescriptor_t data_desc_;
+  cudnnTensorDescriptor_t in_param_desc_;
+
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
+#endif
+  if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+    LOG(ERROR) << "Provided epsilon is smaller than "
+               << "CUDNN_BN_MIN_EPSILON. Setting it to "
+               << "CUDNN_BN_MIN_EPSILON instead.";
+  }
+  epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+
+  VLOG(3) << "Setting descriptors.";
+  std::vector<int> dims;
+  std::vector<int> strides;
+  dims = {1, NxC, H, W, D};
+  strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
+
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
+      data_desc_,
+      CudnnDataType<T>::type,
+      x_dims.size() > 3 ? x_dims.size() : 4,
+      const_cast<int *>(dims.data()),
+      const_cast<int *>(strides.data())));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
+      in_param_desc_, data_desc_, miopenBNSpatial));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
+      data_desc_,
+      CudnnDataType<T>::type,
+      x_dims.size() > 3 ? x_dims.size() : 4,
+      dims.data(),
+      strides.data()));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
+      in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
+#endif
+
+  const auto scale_ptr = scale.get_ptr();
+  const auto bias_ptr = bias.get_ptr();
+
+  DenseTensor scale_tmp;
+  scale_tmp.Resize({NxC});
+  dev_ctx.template Alloc<AccT>(&scale_tmp);
+  DenseTensor bias_tmp;
+  bias_tmp.Resize({NxC});
+  dev_ctx.template Alloc<AccT>(&bias_tmp);
+
+  const int n = x.numel();
+  const int block = 512;
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(max_threads / block, 1);
+  const int grid = std::min((NxC + block - 1) / block, max_blocks);
+
+  phi::funcs::SetConstant<Context, AccT> set_constant;
+  if (scale_ptr) {
+    repeat_param<AccT><<<grid, block, 0, dev_ctx.stream()>>>(
+        scale_ptr->data<AccT>(), scale_tmp.data<AccT>(), N, C);
+  } else {
+    set_constant(dev_ctx, &scale_tmp, static_cast<AccT>(1));
+  }
+  if (bias_ptr) {
+    repeat_param<AccT><<<grid, block, 0, dev_ctx.stream()>>>(
+        bias_ptr->data<AccT>(), bias_tmp.data<AccT>(), N, C);
+  } else {
+    set_constant(dev_ctx, &bias_tmp, static_cast<AccT>(0));
+  }
+
+  auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace());
+  DenseTensor saved_mean_tmp, saved_variance_tmp;
+
+  if (saved_mean) {
+    dev_ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
+    functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
+  } else {
+    saved_mean_tmp = phi::Full<BatchNormParamType<T>>(
+        dev_ctx, {NxC}, static_cast<BatchNormParamType<T>>(0));
+  }
+  if (saved_variance) {
+    dev_ctx.template Alloc<BatchNormParamType<T>>(saved_variance);
+    functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
+  } else {
+    saved_variance_tmp = phi::Full<BatchNormParamType<T>>(
+        dev_ctx, {NxC}, static_cast<BatchNormParamType<T>>(0));
+  }
+  auto *saved_mean_data = saved_mean
+                              ? saved_mean->data<BatchNormParamType<T>>()
+                              : saved_mean_tmp.data<BatchNormParamType<T>>();
+  auto *saved_variance_data =
+      saved_variance ? saved_variance->data<BatchNormParamType<T>>()
+                     : saved_variance_tmp.data<BatchNormParamType<T>>();
+
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenBatchNormalizationForwardTraining(
+          handle,
+          miopenBNSpatial,
+          const_cast<void *>(
+              static_cast<const void *>(CudnnDataType<T>::kOne())),
+          const_cast<void *>(
+              static_cast<const void *>(CudnnDataType<T>::kZero())),
+          data_desc_,
+          static_cast<const void *>(x_tmp.template data<T>()),
+          data_desc_,
+          static_cast<void *>(y->template data<T>()),
+          in_param_desc_,
+          const_cast<void *>(static_cast<const void *>(
+              scale_tmp.template data<BatchNormParamType<T>>())),
+          const_cast<void *>(static_cast<const void *>(
+              bias_tmp.template data<BatchNormParamType<T>>())),
+          0,
+          nullptr,
+          nullptr,
+          epsilon,
+          static_cast<void *>(saved_mean_data),
+          static_cast<void *>(saved_variance_data)));
+
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnBatchNormalizationForwardTraining(
+          handle,
+          CUDNN_BATCHNORM_SPATIAL,
+          CudnnDataType<T>::kOne(),
+          CudnnDataType<T>::kZero(),
+          data_desc_,
+          x_tmp.template data<T>(),
+          data_desc_,
+          y->template data<T>(),
+          in_param_desc_,
+          scale_tmp.template data<BatchNormParamType<T>>(),
+          bias_tmp.template data<BatchNormParamType<T>>(),
+          0,
+          nullptr,
+          nullptr,
+          epsilon,
+          saved_mean_data,
+          saved_variance_data));
+
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
+#endif
+}
+
+}  // namespace phi
+
+PD_REGISTER_PLUGIN_KERNEL(instance_norm,
+                          metax_gpu,
+                          ALL_LAYOUT,
+                          phi::InstanceNormKernel,
+                          float,
+                          double,
+                          phi::dtype::float16,
+                          phi::dtype::bfloat16) {
+  if (kernel_key.dtype() == phi::DataType::FLOAT16 ||
+      kernel_key.dtype() == phi::DataType::BFLOAT16) {
+    kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32);
+    kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32);
+  }
+}
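A minimal smoke test for the instance_norm kernels registered above — a sketch only, not part of the patch. It assumes the plugin is built and installed, that the custom device is exposed to Paddle as "metax_gpu", and that float32 tolerances of 1e-5 are appropriate; all three are assumptions.

    import numpy as np
    import paddle

    x_np = np.random.rand(2, 3, 4, 4).astype("float32")

    def run(device):
        # "metax_gpu:0" is the assumed device string for this plugin.
        paddle.set_device(device)
        x = paddle.to_tensor(x_np)
        return paddle.nn.functional.instance_norm(x, eps=1e-5).numpy()

    # The custom-device kernel should match the CPU reference kernel.
    np.testing.assert_allclose(run("cpu"), run("metax_gpu:0"),
                               rtol=1e-5, atol=1e-5)
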
diff --git a/backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu new file mode 100644 index 00000000000..f99621f8ab9 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/spectral_norm_grad_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(spectral_norm_grad, + metax_gpu, + ALL_LAYOUT, + phi::SpectralNormGradKernel, + float, + double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu new file mode 100644 index 00000000000..466937f993b --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/spectral_norm_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(spectral_norm, + metax_gpu, + ALL_LAYOUT, + phi::SpectralNormKernel, + float, + double) {} diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 184599263fa..682cee35caf 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -1028,6 +1028,468 @@ index 6f03f76eeb..5fe2c3e7dc 100644 #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" +diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +index 4099d8b506..baef2cd643 100644 +--- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h ++++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +@@ -14,7 +14,7 @@ + + #pragma once + +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/eigen/common.h" + #include "paddle/phi/kernels/funcs/math_function.h" + +diff --git a/test/legacy_test/test_batch_norm_op.py b/test/legacy_test/test_batch_norm_op.py +index 4a5660ea0e..ca4e456e02 100644 +--- a/test/legacy_test/test_batch_norm_op.py ++++ b/test/legacy_test/test_batch_norm_op.py +@@ -22,7 +22,9 @@ from op_test import ( + _set_use_system_allocator, + convert_float_to_uint16, + convert_uint16_to_float, +- get_places, ++ get_devices, ++ is_custom_device, ++ get_device_place, + ) + + import paddle +@@ -189,6 +191,7 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): + + + def create_or_get_tensor(scope, var_name, var, place): ++ + tensor = scope.var(var_name).get_tensor() + if var is not None: + assert isinstance(var, np.ndarray) +@@ -321,7 +324,6 @@ class TestBatchNormOpInference(unittest.TestCase): + fuse_with_relu=self.fuse_with_relu, + epsilon=epsilon, + ) +- + batch_norm_op.run(scope, place) + + # When op is called without Executor then +@@ -454,7 +456,7 @@ class TestBatchNormOpInference(unittest.TestCase): + ) + + def test_check_output(self): +- for place in get_places(): ++ for place in get_devices(): + for data_format in ["NCHW", "NHWC"]: + self.check_with_place( + place, +@@ -488,8 +490,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): + + def test_check_output(self): + places = [] +- if core.is_compiled_with_cuda(): +- place = core.CUDAPlace(0) ++ if core.is_compiled_with_cuda() or is_custom_device(): ++ place = get_device_place() + if core.is_float16_supported(place): + places.append(place) + for place in places: +@@ -510,8 +512,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda() +- or not core.is_bfloat16_supported(core.CUDAPlace(0)), ++ not (core.is_compiled_with_cuda() or is_custom_device()) ++ or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA or not support the bfloat16", + ) + class TestBF16BatchNormOpInference(TestBatchNormOpInference): +@@ -522,7 +524,7 @@ class TestBF16BatchNormOpInference(TestBatchNormOpInference): + self.init_kernel_type() + + def test_check_output(self): +- places = [core.CUDAPlace(0)] ++ places = [get_device_place()] + for place in places: + # for data_format in ["NCHW", "NHWC"]: + for data_format in ["NCHW"]: +@@ -562,7 +564,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase): + + class TestDygraphBatchNormTrainableStats(unittest.TestCase): + def 
test_dygraph(self): +- for p in get_places(): ++ for p in get_devices(): + shape = [4, 10, 4, 4] + + def compute(x, is_test, trainable_statistics): +@@ -581,7 +583,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase): + np.testing.assert_allclose(y1, y2, rtol=1e-05) + + def test_static(self): +- for p in get_places(): ++ for p in get_devices(): + exe = base.Executor(p) + shape = [4, 10, 16, 16] + +@@ -625,7 +627,7 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): + + class TestBatchNormAPI_ZeroSize(unittest.TestCase): + def setUp(self): +- self.places = get_places() ++ self.places = get_devices() + + def test_dygraph(self): + for place in self.places: +diff --git a/test/legacy_test/test_conv3d_transpose_op.py b/test/legacy_test/test_conv3d_transpose_op.py +index c9853e9073..277eb26d00 100644 +--- a/test/legacy_test/test_conv3d_transpose_op.py ++++ b/test/legacy_test/test_conv3d_transpose_op.py +@@ -19,7 +19,7 @@ import numpy as np + import paddle + + paddle.enable_static() +-from op_test import OpTest, copy_bits_from_float_to_uint16 ++from op_test import OpTest, copy_bits_from_float_to_uint16, is_custom_device, get_devices, get_device_place + + from paddle.base import core + +@@ -150,7 +150,7 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs): + + def create_test_cudnn_fp16_class(parent, grad_check=True): + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()), "core is not compiled with CUDA" + ) + class TestConv3DTransposeCUDNNFP16(parent): + def init_kernel_type(self): +@@ -158,20 +158,20 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): + self.dtype = np.float16 + + def test_check_output(self): +- if core.is_compiled_with_cuda(): +- place = core.CUDAPlace(0) ++ if ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()): ++ place = get_device_place() + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) + + def test_check_grad_no_filter(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Input'], 'Output', no_grad_set={'Filter'} + ) + + def test_check_grad_no_input(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Filter'], 'Output', no_grad_set={'Input'} +@@ -184,8 +184,8 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): + + def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( +- not core.is_compiled_with_cuda() +- or not core.is_bfloat16_supported(core.CUDAPlace(0)), ++ not (core.is_compiled_with_cuda() or is_custom_device()) ++ or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestConv3DTransposeCUDNNBF16(parent): +@@ -194,11 +194,11 @@ def create_test_cudnn_bf16_class(parent): + self.dtype = np.uint16 + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_output_with_place(place) + + def test_check_grad(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + {'Input', 'Filter'}, +@@ -206,7 +206,7 @@ def create_test_cudnn_bf16_class(parent): + ) + + def test_check_grad_no_filter(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + 
self.check_grad_with_place( + place, + ['Input'], +@@ -215,7 +215,7 @@ def create_test_cudnn_bf16_class(parent): + ) + + def test_check_grad_no_input(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + ['Filter'], +@@ -306,14 +306,14 @@ class TestConv3DTransposeOp(OpTest): + + def test_check_output(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() + + def test_check_grad(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + {'Input', 'Filter'}, +@@ -327,7 +327,7 @@ class TestConv3DTransposeOp(OpTest): + + def test_check_grad_no_filter(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + ['Input'], +@@ -345,7 +345,7 @@ class TestConv3DTransposeOp(OpTest): + + def test_check_grad_no_input(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + ['Filter'], +@@ -471,7 +471,7 @@ class Test_NHWC(TestConv3DTransposeOp): + + # ------------ test_cudnn ------------ + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNN(TestConv3DTransposeOp): + def init_op_type(self): +@@ -481,7 +481,7 @@ class TestCUDNN(TestConv3DTransposeOp): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): + def init_test_case(self): +@@ -500,7 +500,7 @@ class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): + def init_test_case(self): +@@ -519,7 +519,7 @@ class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithSAMEPad(TestWithSAMEPad): + def init_test_case(self): +@@ -538,7 +538,7 @@ class TestCUDNNWithSAMEPad(TestWithSAMEPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithVALIDPad(TestWithVALIDPad): + def init_test_case(self): +@@ -557,7 +557,7 @@ class TestCUDNNWithVALIDPad(TestWithVALIDPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithStride(TestWithStride): + def init_test_case(self): +@@ -576,7 +576,7 @@ class TestCUDNNWithStride(TestWithStride): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithGroups(TestWithGroups): + def init_test_case(self): +@@ 
-610,7 +610,7 @@ class TestCUDNNWithGroups(TestWithGroups): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNN_NHWC(TestConv3DTransposeOp): + def init_test_case(self): +@@ -630,7 +630,7 @@ class TestCUDNN_NHWC(TestConv3DTransposeOp): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): + def init_test_case(self): +@@ -650,7 +650,7 @@ class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): + def init_test_case(self): +@@ -670,7 +670,7 @@ class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithStride_NHWC(TestWithStride): + def init_test_case(self): +@@ -690,7 +690,7 @@ class TestCUDNNWithStride_NHWC(TestWithStride): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithGroups_NHWC(TestWithGroups): + def init_test_case(self): +diff --git a/test/legacy_test/test_cross_entropy_op.py b/test/legacy_test/test_cross_entropy_op.py +index 74eedb6a48..e4c6ecb98a 100644 +--- a/test/legacy_test/test_cross_entropy_op.py ++++ b/test/legacy_test/test_cross_entropy_op.py +@@ -20,6 +20,8 @@ from op_test import ( + get_places, + paddle_static_guard, + randomize_probability, ++ is_custom_device, ++ get_device_place, + ) + + import paddle +@@ -385,19 +387,19 @@ class TestCrossEntropyOp7RemoveLastDim(TestCrossEntropyOp7): + # Add Fp16 test + def create_test_class(parent, cls_name): + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCrossEntropyFP16Op(parent): + def init_dtype_type(self): + return np.float16 + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-1) + + def test_check_grad(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place): + self.check_grad_with_place( + place, ['X'], 'Y', max_relative_error=0.9 +diff --git a/test/legacy_test/test_fmin_op.py b/test/legacy_test/test_fmin_op.py +index 4c9944e877..e6ed5c0f8e 100644 +--- a/test/legacy_test/test_fmin_op.py ++++ b/test/legacy_test/test_fmin_op.py +@@ -15,8 +15,7 @@ + import unittest + + import numpy as np +-from op_test import OpTest, convert_float_to_uint16 +- ++from op_test import OpTest, convert_float_to_uint16, is_custom_device, get_devices, get_device_place + import paddle + from paddle.base import core + +@@ -28,8 +27,8 @@ class ApiFMinTest(unittest.TestCase): + + def setUp(self): + """setUp""" +- if core.is_compiled_with_cuda(): +- self.place = 
core.CUDAPlace(0) ++ if core.is_compiled_with_cuda() or is_custom_device(): ++ self.place = get_device_place() + else: + self.place = core.CPUPlace() + +@@ -259,8 +258,8 @@ class TestElementwiseFmin3Op(OpTest): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda() +- or not core.is_bfloat16_supported(core.CUDAPlace(0)), ++ not (core.is_compiled_with_cuda() or is_custom_device()) ++ or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and not support the bfloat16", + ) + class TestFminBF16OP(OpTest): +@@ -281,13 +280,13 @@ class TestFminBF16OP(OpTest): + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_output_with_place( + place, check_pir=True, check_symbol_infer=False + ) + + def test_check_grad(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True + ) +@@ -304,7 +303,7 @@ class TestElementwiseFminOpZeroSize1(TestElementwiseFminOp): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestElementwiseFminOp_Stride(OpTest): + no_need_check_grad = True +@@ -335,7 +334,7 @@ class TestElementwiseFminOp_Stride(OpTest): + self.val_dtype = np.float64 + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_strided_forward = True + self.check_output( + place, +diff --git a/test/legacy_test/test_spectral_norm_op.py b/test/legacy_test/test_spectral_norm_op.py +index 80e5c2ec63..f1602a8b40 100644 +--- a/test/legacy_test/test_spectral_norm_op.py ++++ b/test/legacy_test/test_spectral_norm_op.py +@@ -112,6 +112,7 @@ class TestSpectralNormOpNoGrad2(TestSpectralNormOpNoGrad): + + class TestSpectralNormOp(TestSpectralNormOpNoGrad): + def test_check_grad_ignore_uv(self): ++ + self.check_grad( + ['Weight'], + 'Out', diff --git a/third_party/flagcx b/third_party/flagcx index 77495cd6a8..7e6c4cc3ca 160000 --- a/third_party/flagcx From a1530d2b4a9837dc9975fff03fac774a45ea702d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:41:45 +0800 Subject: [PATCH 02/58] [metax]change_cupti_and_fix_softmax (#7) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/kernels/funcs/softmax.cu | 168 ++++++ .../cross_entropy_grad_kernel_register.cu | 10 +- backends/metax_gpu/patch/paddle.patch | 511 ++---------------- .../metax_gpu/runtime/process_cupti_data.cc | 136 +++-- 4 files changed, 309 insertions(+), 516 deletions(-) create mode 100644 backends/metax_gpu/kernels/funcs/softmax.cu diff --git a/backends/metax_gpu/kernels/funcs/softmax.cu b/backends/metax_gpu/kernels/funcs/softmax.cu new file mode 100644 index 00000000000..d738a53f43a --- /dev/null +++ b/backends/metax_gpu/kernels/funcs/softmax.cu @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+#include <vector>
+
+#include "kernels/metax_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_dnn.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
+#include "paddle/phi/kernels/funcs/softmax_impl.h"
+
+namespace phi {
+namespace funcs {
+
+using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor;
+using DataLayout = phi::backends::gpu::DataLayout;
+template <typename T>
+using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
+
+template <typename T, typename DeviceContext>
+void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
+    const DeviceContext& dev_ctx,
+    const phi::DenseTensor* X,
+    phi::DenseTensor* Y) {
+  // ------------------- cudnn descriptors ---------------------
+  ScopedTensorDescriptor xDesc;
+  ScopedTensorDescriptor yDesc;
+  std::vector<int> cudnn_tensor_dims = common::vectorize<int>(X->dims());
+  DataLayout layout = DataLayout::kNCHW;
+  if (cudnn_tensor_dims.size() == 5) {
+    layout = DataLayout::kNCDHW;
+  }
+  // NOTE(*) : cudnn softmax only support >= 4D phi::DenseTensor,
+  // fill 1 at unused dims
+  if (cudnn_tensor_dims.size() <= 2) {
+    cudnn_tensor_dims.resize(4, 1);
+  }
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t cudnn_x_desc =
+      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  miopenTensorDescriptor_t cudnn_y_desc =
+      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenSoftmaxForward_V2(dev_ctx.cudnn_handle(),
+                                            CudnnDataType<T>::kOne(),
+                                            cudnn_x_desc,
+                                            X->data<T>(),
+                                            CudnnDataType<T>::kZero(),
+                                            cudnn_y_desc,
+                                            dev_ctx.template Alloc<T>(Y),
+                                            MIOPEN_SOFTMAX_ACCURATE,
+                                            MIOPEN_SOFTMAX_MODE_INSTANCE));
+#else
+  cudnnTensorDescriptor_t cudnn_x_desc =
+      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  cudnnTensorDescriptor_t cudnn_y_desc =
+      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxForward(
+      GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()),
+      CUDNN_SOFTMAX_ACCURATE,
+      CUDNN_SOFTMAX_MODE_INSTANCE,
+      CudnnDataType<T>::kOne(),
+      cudnn_x_desc,
+      X->data<T>(),
+      CudnnDataType<T>::kZero(),
+      cudnn_y_desc,
+      dev_ctx.template Alloc<T>(Y)));
+#endif
+}
+
+template <typename T, typename DeviceContext>
+void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
+    const DeviceContext& dev_ctx,
+    const phi::DenseTensor* Y,
+    const phi::DenseTensor* YGrad,
+    phi::DenseTensor* XGrad) {
+  // ------------------- cudnn descriptors ---------------------
+  ScopedTensorDescriptor yDesc;
+  ScopedTensorDescriptor dyDesc;
+  ScopedTensorDescriptor dxDesc;
+  std::vector<int> cudnn_tensor_dims = common::vectorize<int>(Y->dims());
+  DataLayout layout = DataLayout::kNCHW;
+  if (cudnn_tensor_dims.size() == 5) {
+    layout = DataLayout::kNCDHW;
+  }
+  // NOTE(*) : cudnn softmax only support >= 4D phi::DenseTensor,
+  // fill 1 at unused dims
+  if (cudnn_tensor_dims.size() <= 2) {
+    cudnn_tensor_dims.resize(4, 1);
+  }
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t cudnn_y_desc =
+      yDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  miopenTensorDescriptor_t cudnn_xgrad_desc =
+      dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  miopenTensorDescriptor_t cudnn_ygrad_desc =
+      dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxBackward_V2(
+      GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()),
+      CudnnDataType<T>::kOne(),
+      cudnn_y_desc,
+      Y->data<T>(),
+      cudnn_ygrad_desc,
+      YGrad->data<T>(),
+      CudnnDataType<T>::kZero(),
+      cudnn_xgrad_desc,
+      dev_ctx.template Alloc<T>(XGrad),
+      MIOPEN_SOFTMAX_ACCURATE,
+      MIOPEN_SOFTMAX_MODE_INSTANCE));
+#else
+  cudnnTensorDescriptor_t cudnn_y_desc =
+      yDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  cudnnTensorDescriptor_t cudnn_xgrad_desc =
+      dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  cudnnTensorDescriptor_t cudnn_ygrad_desc =
+      dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxBackward(
+      GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()),
+      CUDNN_SOFTMAX_ACCURATE,
+      CUDNN_SOFTMAX_MODE_INSTANCE,
+      CudnnDataType<T>::kOne(),
+      cudnn_y_desc,
+      Y->data<T>(),
+      cudnn_ygrad_desc,
+      YGrad->data<T>(),
+      CudnnDataType<T>::kZero(),
+      cudnn_xgrad_desc,
+      dev_ctx.template Alloc<T>(XGrad)));
+#endif
+}
+
+template class SoftmaxCUDNNFunctor<float, phi::GPUContext>;
+template class SoftmaxCUDNNFunctor<phi::dtype::float16, phi::GPUContext>;
+template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>;
+template class SoftmaxGradCUDNNFunctor<phi::dtype::float16, phi::GPUContext>;
+#if CUDNN_VERSION_MIN(8, 1, 0)
+template class SoftmaxCUDNNFunctor<phi::dtype::bfloat16, phi::GPUContext>;
+template class SoftmaxGradCUDNNFunctor<phi::dtype::bfloat16, phi::GPUContext>;
+#endif
+
+// MIOPEN do not support double
+#ifndef PADDLE_WITH_HIP
+template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
+template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>;
+#endif
+
+template class SoftmaxFunctor<phi::GPUContext, phi::dtype::float16>;
+template class SoftmaxFunctor<phi::GPUContext, phi::dtype::bfloat16>;
+template class SoftmaxFunctor<phi::GPUContext, float>;
+template class SoftmaxFunctor<phi::GPUContext, double>;
+template class SoftmaxGradFunctor<phi::GPUContext, float>;
+template class SoftmaxGradFunctor<phi::GPUContext, double>;
+template class SoftmaxGradFunctor<phi::GPUContext, phi::dtype::float16>;
+template class SoftmaxGradFunctor<phi::GPUContext, phi::dtype::bfloat16>;
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu
index b5de9dd8f3c..402f69a9958 100644
--- a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu
@@ -149,11 +149,11 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx,
                                           int ignore_index,
                                           int axis,
                                           DenseTensor* logits_grad) {
-  PADDLE_ENFORCE_EQ(
-      dev_ctx.GetPlace().GetType(),
-      phi::AllocationType::GPU,
-      common::errors::Unavailable("softmax_with_cross_entropy operator's "
-                                  "CUDA kernel only runs on GPU device."));
+  // PADDLE_ENFORCE_EQ(
+  //     dev_ctx.GetPlace().GetType(),
+  //     phi::AllocationType::GPU,
+  //     common::errors::Unavailable("softmax_with_cross_entropy operator's "
+  //                                 "CUDA kernel only runs on GPU device."));
 
   const T* loss_grad_data = loss_grad.data<T>();
   DenseTensor* logit_grad = logits_grad;
diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch
index 682cee35caf..1935217baa0 100755
--- a/backends/metax_gpu/patch/paddle.patch
+++ b/backends/metax_gpu/patch/paddle.patch
@@ -419,7 +419,7 @@ index d69eb67d6f..1d8b6e9375 100644
  #include "paddle/phi/kernels/funcs/math_function.h"
 
 diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu
-index bdfd7313af..546bd07d5e 100644
+index cb35feee32..64f5bd24ac 100644
 --- a/paddle/phi/kernels/funcs/fc_functor.cu
 +++ b/paddle/phi/kernels/funcs/fc_functor.cu
 @@ -16,12 +16,12 @@ limitations under the License. */
@@ -438,7 +438,7 @@ index bdfd7313af..546bd07d5e 100644
  #include "paddle/phi/kernels/matmul_kernel.h"
 
 diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu
-index 1a9a9cfb85..08ebe4b8af 100644
+index e101224970..a52eb6096f 100644
 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu
 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu
 @@ -15,11 +15,13 @@ limitations under the License.
*/ @@ -470,10 +470,10 @@ index 558d363b39..05da04b517 100644 #include "paddle/phi/kernels/funcs/scatter.cu.h" diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h -index dc7935423c..84896c2214 100644 +index e30d440ff3..3c74792690 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h -@@ -32,11 +32,11 @@ limitations under the License. */ +@@ -30,11 +30,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" @@ -487,7 +487,7 @@ index dc7935423c..84896c2214 100644 #endif #define MAX_NUM_THREADS 1024 -@@ -200,21 +200,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], +@@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { if (topk[k] < p) { @@ -549,7 +549,7 @@ index dc7935423c..84896c2214 100644 } template -@@ -243,24 +278,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], +@@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template __device__ __forceinline__ void GetTopK(Pair topk[], const T* src, @@ -581,7 +581,7 @@ index dc7935423c..84896c2214 100644 } } } -@@ -287,7 +322,9 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], +@@ -283,7 +318,9 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], } else { for (int k = 0; k < MaxLength; k++) { if (k < MaxLength - (*beam)) { @@ -592,7 +592,7 @@ index dc7935423c..84896c2214 100644 } else { if (largest) { topk[k].set(-static_cast(INFINITY), -1); -@@ -297,8 +334,10 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], +@@ -293,8 +330,10 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], } } if (!(*is_empty)) { @@ -604,7 +604,7 @@ index dc7935423c..84896c2214 100644 } } -@@ -359,6 +398,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], +@@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } __syncthreads(); @@ -613,7 +613,7 @@ index dc7935423c..84896c2214 100644 if (largest) { input_now = (tid < BlockSize / WARP_SIZE) ? shared_max[lane] -@@ -373,27 +414,32 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], +@@ -369,27 +410,32 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], if (lane == 0) shared_max[0] = input_now; } __syncthreads(); @@ -652,7 +652,7 @@ index dc7935423c..84896c2214 100644 break; } } -@@ -482,16 +528,17 @@ struct Bitfield { +@@ -478,16 +524,17 @@ struct Bitfield { int pos, int len) { unsigned int ret; @@ -674,7 +674,7 @@ index dc7935423c..84896c2214 100644 return ret; } }; -@@ -502,7 +549,9 @@ struct Bitfield { +@@ -498,7 +545,9 @@ struct Bitfield { int pos, int len) { uint64_t ret; @@ -685,7 +685,7 @@ index dc7935423c..84896c2214 100644 return ret; } -@@ -511,9 +560,9 @@ struct Bitfield { +@@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { uint64_t ret; @@ -698,7 +698,7 @@ index dc7935423c..84896c2214 100644 return ret; } }; -@@ -631,14 +680,20 @@ struct RadixTypeConfig { +@@ -627,14 +676,20 @@ struct RadixTypeConfig { /*---------------------------Helper Functions------------------*/ __device__ __forceinline__ int GetLaneId() { int lane_id; @@ -723,7 +723,7 @@ index dc7935423c..84896c2214 100644 } template -@@ -885,7 +940,8 @@ __global__ void GatherKthValue(const T* input, +@@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, // 1. 
Find the k-th value T kth_value = static_cast(0); @@ -733,13 +733,13 @@ index dc7935423c..84896c2214 100644 cur_input, k, num_cols, shared_mem, &kth_value); __shared__ int64_t block_min_idx; -@@ -1318,3 +1374,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, +@@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } } // namespace funcs } // namespace phi +// diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h -index 45a29b4cff..8449e3d309 100644 +index 32db61532f..0220316bc3 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ @@ -752,7 +752,7 @@ index 45a29b4cff..8449e3d309 100644 #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h -index 7d05bcb654..c79cdadabc 100644 +index 9d4bb18d55..ea42cc10a9 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h @@ -638,9 +638,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( @@ -767,11 +767,11 @@ index 7d05bcb654..c79cdadabc 100644 } } diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu -index ad04265bd6..59481d0e6a 100644 +index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu -@@ -15,7 +15,7 @@ - #include "paddle/phi/common/bfloat16.h" +@@ -14,7 +14,7 @@ + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" @@ -780,11 +780,11 @@ index ad04265bd6..59481d0e6a 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -index 148d72ca9c..5da3461ebf 100644 +index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -@@ -15,7 +15,7 @@ - #include "paddle/phi/common/bfloat16.h" +@@ -14,7 +14,7 @@ + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" @@ -793,7 +793,7 @@ index 148d72ca9c..5da3461ebf 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h -index b16553589a..90080c375d 100644 +index f0cca0f701..02ea957240 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -29,8 +29,8 @@ namespace cub = hipcub; @@ -833,7 +833,7 @@ index 29fa252e96..4ae72b0935 100644 } diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu -index ee71a2b452..69130ab955 100644 +index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -17,7 +17,7 @@ @@ -846,7 +846,7 @@ index ee71a2b452..69130ab955 100644 namespace phi { diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu -index 
00a2f1e210..1267cf7ec2 100644 +index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -17,7 +17,7 @@ @@ -872,7 +872,7 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h -index 14b24dd3ed..e54a342c98 100644 +index 9bc5326c90..79b57a8203 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -21,7 +21,7 @@ limitations under the License. */ @@ -885,7 +885,7 @@ index 14b24dd3ed..e54a342c98 100644 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h -index 06fff0dd58..973049105f 100644 +index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. */ @@ -1041,461 +1041,12 @@ index 4099d8b506..baef2cd643 100644 #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" -diff --git a/test/legacy_test/test_batch_norm_op.py b/test/legacy_test/test_batch_norm_op.py -index 4a5660ea0e..ca4e456e02 100644 ---- a/test/legacy_test/test_batch_norm_op.py -+++ b/test/legacy_test/test_batch_norm_op.py -@@ -22,7 +22,9 @@ from op_test import ( - _set_use_system_allocator, - convert_float_to_uint16, - convert_uint16_to_float, -- get_places, -+ get_devices, -+ is_custom_device, -+ get_device_place, - ) - - import paddle -@@ -189,6 +191,7 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): - - - def create_or_get_tensor(scope, var_name, var, place): -+ - tensor = scope.var(var_name).get_tensor() - if var is not None: - assert isinstance(var, np.ndarray) -@@ -321,7 +324,6 @@ class TestBatchNormOpInference(unittest.TestCase): - fuse_with_relu=self.fuse_with_relu, - epsilon=epsilon, - ) -- - batch_norm_op.run(scope, place) - - # When op is called without Executor then -@@ -454,7 +456,7 @@ class TestBatchNormOpInference(unittest.TestCase): - ) - - def test_check_output(self): -- for place in get_places(): -+ for place in get_devices(): - for data_format in ["NCHW", "NHWC"]: - self.check_with_place( - place, -@@ -488,8 +490,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): - - def test_check_output(self): - places = [] -- if core.is_compiled_with_cuda(): -- place = core.CUDAPlace(0) -+ if core.is_compiled_with_cuda() or is_custom_device(): -+ place = get_device_place() - if core.is_float16_supported(place): - places.append(place) - for place in places: -@@ -510,8 +512,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda() -- or not core.is_bfloat16_supported(core.CUDAPlace(0)), -+ not (core.is_compiled_with_cuda() or is_custom_device()) -+ or not core.is_bfloat16_supported(get_device_place()), - "core is not compiled with CUDA or not support the bfloat16", - ) - class TestBF16BatchNormOpInference(TestBatchNormOpInference): -@@ -522,7 +524,7 @@ class TestBF16BatchNormOpInference(TestBatchNormOpInference): - self.init_kernel_type() - - def test_check_output(self): -- places = [core.CUDAPlace(0)] -+ places = 
[get_device_place()] - for place in places: - # for data_format in ["NCHW", "NHWC"]: - for data_format in ["NCHW"]: -@@ -562,7 +564,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase): - - class TestDygraphBatchNormTrainableStats(unittest.TestCase): - def test_dygraph(self): -- for p in get_places(): -+ for p in get_devices(): - shape = [4, 10, 4, 4] - - def compute(x, is_test, trainable_statistics): -@@ -581,7 +583,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase): - np.testing.assert_allclose(y1, y2, rtol=1e-05) - - def test_static(self): -- for p in get_places(): -+ for p in get_devices(): - exe = base.Executor(p) - shape = [4, 10, 16, 16] - -@@ -625,7 +627,7 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): - - class TestBatchNormAPI_ZeroSize(unittest.TestCase): - def setUp(self): -- self.places = get_places() -+ self.places = get_devices() - - def test_dygraph(self): - for place in self.places: -diff --git a/test/legacy_test/test_conv3d_transpose_op.py b/test/legacy_test/test_conv3d_transpose_op.py -index c9853e9073..277eb26d00 100644 ---- a/test/legacy_test/test_conv3d_transpose_op.py -+++ b/test/legacy_test/test_conv3d_transpose_op.py -@@ -19,7 +19,7 @@ import numpy as np - import paddle - - paddle.enable_static() --from op_test import OpTest, copy_bits_from_float_to_uint16 -+from op_test import OpTest, copy_bits_from_float_to_uint16, is_custom_device, get_devices, get_device_place - - from paddle.base import core - -@@ -150,7 +150,7 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs): - - def create_test_cudnn_fp16_class(parent, grad_check=True): - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()), "core is not compiled with CUDA" - ) - class TestConv3DTransposeCUDNNFP16(parent): - def init_kernel_type(self): -@@ -158,20 +158,20 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): - self.dtype = np.float16 - - def test_check_output(self): -- if core.is_compiled_with_cuda(): -- place = core.CUDAPlace(0) -+ if ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()): -+ place = get_device_place() - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=2e-2) - - def test_check_grad_no_filter(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place) and grad_check: - self.check_grad_with_place( - place, ['Input'], 'Output', no_grad_set={'Filter'} - ) - - def test_check_grad_no_input(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place) and grad_check: - self.check_grad_with_place( - place, ['Filter'], 'Output', no_grad_set={'Input'} -@@ -184,8 +184,8 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): - - def create_test_cudnn_bf16_class(parent): - @unittest.skipIf( -- not core.is_compiled_with_cuda() -- or not core.is_bfloat16_supported(core.CUDAPlace(0)), -+ not (core.is_compiled_with_cuda() or is_custom_device()) -+ or not core.is_bfloat16_supported(get_device_place()), - "core is not compiled with CUDA and do not support bfloat16", - ) - class TestConv3DTransposeCUDNNBF16(parent): -@@ -194,11 +194,11 @@ def create_test_cudnn_bf16_class(parent): - self.dtype = np.uint16 - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_output_with_place(place) - - def test_check_grad(self): -- place = core.CUDAPlace(0) -+ 
place = get_device_place() - self.check_grad_with_place( - place, - {'Input', 'Filter'}, -@@ -206,7 +206,7 @@ def create_test_cudnn_bf16_class(parent): - ) - - def test_check_grad_no_filter(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Input'], -@@ -215,7 +215,7 @@ def create_test_cudnn_bf16_class(parent): - ) - - def test_check_grad_no_input(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Filter'], -@@ -306,14 +306,14 @@ class TestConv3DTransposeOp(OpTest): - - def test_check_output(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_output_with_place(place, atol=1e-5) - else: - self.check_output() - - def test_check_grad(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - {'Input', 'Filter'}, -@@ -327,7 +327,7 @@ class TestConv3DTransposeOp(OpTest): - - def test_check_grad_no_filter(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Input'], -@@ -345,7 +345,7 @@ class TestConv3DTransposeOp(OpTest): - - def test_check_grad_no_input(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Filter'], -@@ -471,7 +471,7 @@ class Test_NHWC(TestConv3DTransposeOp): - - # ------------ test_cudnn ------------ - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNN(TestConv3DTransposeOp): - def init_op_type(self): -@@ -481,7 +481,7 @@ class TestCUDNN(TestConv3DTransposeOp): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): - def init_test_case(self): -@@ -500,7 +500,7 @@ class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): - def init_test_case(self): -@@ -519,7 +519,7 @@ class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithSAMEPad(TestWithSAMEPad): - def init_test_case(self): -@@ -538,7 +538,7 @@ class TestCUDNNWithSAMEPad(TestWithSAMEPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithVALIDPad(TestWithVALIDPad): - def init_test_case(self): -@@ -557,7 +557,7 @@ class TestCUDNNWithVALIDPad(TestWithVALIDPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithStride(TestWithStride): - def init_test_case(self): -@@ -576,7 +576,7 @@ class TestCUDNNWithStride(TestWithStride): - - - 
@unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithGroups(TestWithGroups): - def init_test_case(self): -@@ -610,7 +610,7 @@ class TestCUDNNWithGroups(TestWithGroups): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNN_NHWC(TestConv3DTransposeOp): - def init_test_case(self): -@@ -630,7 +630,7 @@ class TestCUDNN_NHWC(TestConv3DTransposeOp): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): - def init_test_case(self): -@@ -650,7 +650,7 @@ class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): - def init_test_case(self): -@@ -670,7 +670,7 @@ class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithStride_NHWC(TestWithStride): - def init_test_case(self): -@@ -690,7 +690,7 @@ class TestCUDNNWithStride_NHWC(TestWithStride): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithGroups_NHWC(TestWithGroups): - def init_test_case(self): -diff --git a/test/legacy_test/test_cross_entropy_op.py b/test/legacy_test/test_cross_entropy_op.py -index 74eedb6a48..e4c6ecb98a 100644 ---- a/test/legacy_test/test_cross_entropy_op.py -+++ b/test/legacy_test/test_cross_entropy_op.py -@@ -20,6 +20,8 @@ from op_test import ( - get_places, - paddle_static_guard, - randomize_probability, -+ is_custom_device, -+ get_device_place, - ) - - import paddle -@@ -385,19 +387,19 @@ class TestCrossEntropyOp7RemoveLastDim(TestCrossEntropyOp7): - # Add Fp16 test - def create_test_class(parent, cls_name): - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCrossEntropyFP16Op(parent): - def init_dtype_type(self): - return np.float16 - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=2e-1) - - def test_check_grad(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place): - self.check_grad_with_place( - place, ['X'], 'Y', max_relative_error=0.9 -diff --git a/test/legacy_test/test_fmin_op.py b/test/legacy_test/test_fmin_op.py -index 4c9944e877..e6ed5c0f8e 100644 ---- a/test/legacy_test/test_fmin_op.py -+++ b/test/legacy_test/test_fmin_op.py -@@ -15,8 +15,7 @@ - import unittest - - import numpy as np --from op_test import OpTest, convert_float_to_uint16 -- -+from op_test import OpTest, 
convert_float_to_uint16, is_custom_device, get_devices, get_device_place - import paddle - from paddle.base import core - -@@ -28,8 +27,8 @@ class ApiFMinTest(unittest.TestCase): - - def setUp(self): - """setUp""" -- if core.is_compiled_with_cuda(): -- self.place = core.CUDAPlace(0) -+ if core.is_compiled_with_cuda() or is_custom_device(): -+ self.place = get_device_place() - else: - self.place = core.CPUPlace() - -@@ -259,8 +258,8 @@ class TestElementwiseFmin3Op(OpTest): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda() -- or not core.is_bfloat16_supported(core.CUDAPlace(0)), -+ not (core.is_compiled_with_cuda() or is_custom_device()) -+ or not core.is_bfloat16_supported(get_device_place()), - "core is not compiled with CUDA and not support the bfloat16", - ) - class TestFminBF16OP(OpTest): -@@ -281,13 +280,13 @@ class TestFminBF16OP(OpTest): - self.outputs = {'Out': convert_float_to_uint16(out)} - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_output_with_place( - place, check_pir=True, check_symbol_infer=False - ) - - def test_check_grad(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True - ) -@@ -304,7 +303,7 @@ class TestElementwiseFminOpZeroSize1(TestElementwiseFminOp): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestElementwiseFminOp_Stride(OpTest): - no_need_check_grad = True -@@ -335,7 +334,7 @@ class TestElementwiseFminOp_Stride(OpTest): - self.val_dtype = np.float64 - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_strided_forward = True - self.check_output( - place, -diff --git a/test/legacy_test/test_spectral_norm_op.py b/test/legacy_test/test_spectral_norm_op.py -index 80e5c2ec63..f1602a8b40 100644 ---- a/test/legacy_test/test_spectral_norm_op.py -+++ b/test/legacy_test/test_spectral_norm_op.py -@@ -112,6 +112,7 @@ class TestSpectralNormOpNoGrad2(TestSpectralNormOpNoGrad): - - class TestSpectralNormOp(TestSpectralNormOpNoGrad): - def test_check_grad_ignore_uv(self): -+ - self.check_grad( - ['Weight'], - 'Out', diff --git a/third_party/flagcx b/third_party/flagcx -index 77495cd6a8..7e6c4cc3ca 160000 +index 7c469f4af9..7e6c4cc3ca 160000 --- a/third_party/flagcx +++ b/third_party/flagcx @@ -1 +1 @@ --Subproject commit 77495cd6a84b1c8f88dd8f6f99e63ef3c84c766f +-Subproject commit 7c469f4af991bf0f64b8f76d66f8e307a5eaea3f +Subproject commit 7e6c4cc3cad3fce9b3dedfe46a9d195d616e8ffa diff --git a/third_party/flashattn b/third_party/flashattn index 581e48aa69..749aca3807 160000 diff --git a/backends/metax_gpu/runtime/process_cupti_data.cc b/backends/metax_gpu/runtime/process_cupti_data.cc index 65011e3f58d..94caca5d8cb 100755 --- a/backends/metax_gpu/runtime/process_cupti_data.cc +++ b/backends/metax_gpu/runtime/process_cupti_data.cc @@ -226,52 +226,126 @@ class CuptiRuntimeCbidStr { CuptiRuntimeCbidStr::CuptiRuntimeCbidStr() { #define REGISTER_RUNTIME_CBID_STR(cbid) \ cbid_str_[CUPTI_RUNTIME_TRACE_CBID_##cbid] = #cbid - REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020); - REGISTER_RUNTIME_CBID_STR(cudaConfigureCall_v3020); - REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000); - REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050); - REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020); 
REGISTER_RUNTIME_CBID_STR(cudaDriverGetVersion_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020); - REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020); - REGISTER_RUNTIME_CBID_STR(cudaFree_v3020); - REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020); + REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020); REGISTER_RUNTIME_CBID_STR(cudaGetDeviceCount_v3020); REGISTER_RUNTIME_CBID_STR(cudaGetDeviceProperties_v3020); - REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020); - REGISTER_RUNTIME_CBID_STR(cudaGetErrorString_v3020); + REGISTER_RUNTIME_CBID_STR(cudaChooseDevice_v3020); REGISTER_RUNTIME_CBID_STR(cudaGetLastError_v3020); + REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020); + REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFuncSetCacheConfig_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetValidDevices_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetDeviceFlags_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMallocPitch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFree_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMallocArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFreeArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020); REGISTER_RUNTIME_CBID_STR(cudaHostAlloc_v3020); REGISTER_RUNTIME_CBID_STR(cudaHostGetDevicePointer_v3020); - REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000); - REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020); - REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020); - REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaHostGetFlags_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemGetInfo_v3020); REGISTER_RUNTIME_CBID_STR(cudaMemcpy_v3020); - REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy2D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyToArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy2DToArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyToSymbol_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyFromSymbol_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy2DAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyToSymbolAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyFromSymbolAsync_v3020); REGISTER_RUNTIME_CBID_STR(cudaMemset_v3020); - REGISTER_RUNTIME_CBID_STR( - cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000); - REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020); - REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020); - REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset2D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset2DAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetSymbolAddress_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetSymbolSize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020); + REGISTER_RUNTIME_CBID_STR(cudaBindTexture2D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaBindTextureToArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); REGISTER_RUNTIME_CBID_STR(cudaStreamCreate_v3020); - REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000); - 
REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050); - REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v3020); REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaStreamQuery_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventCreate_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventElapsedTime_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMalloc3D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMalloc3DArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset3D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset3DAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3DAsync_v3020); REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020); - REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); - REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020); - REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaPointerGetAttributes_v4000); + REGISTER_RUNTIME_CBID_STR(cudaHostRegister_v4000); + REGISTER_RUNTIME_CBID_STR(cudaHostUnregister_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceCanAccessPeer_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceEnablePeerAccess_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceDisablePeerAccess_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyPeer_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyPeerAsync_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3DPeer_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3DPeerAsync_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceReset_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetLimit_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSetLimit_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetCacheConfig_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSetCacheConfig_v3020); + REGISTER_RUNTIME_CBID_STR(cudaProfilerInitialize_v4000); + REGISTER_RUNTIME_CBID_STR(cudaProfilerStart_v4000); + REGISTER_RUNTIME_CBID_STR(cudaProfilerStop_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetByPCIBusId_v4010); REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcGetEventHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcOpenEventHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcGetMemHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcOpenMemHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcCloseMemHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaFuncSetSharedMemConfig_v4020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetSharedMemConfig_v4020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSetSharedMemConfig_v4020); + REGISTER_RUNTIME_CBID_STR(cudaStreamAddCallback_v5000); + REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000); + REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamGetPriority_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamGetFlags_v5050); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050); + REGISTER_RUNTIME_CBID_STR(cudaMallocManaged_v6000); + REGISTER_RUNTIME_CBID_STR( + cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000); + REGISTER_RUNTIME_CBID_STR(cudaStreamAttachMemAsync_v6000); + 
REGISTER_RUNTIME_CBID_STR( + cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050); + REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000); + REGISTER_RUNTIME_CBID_STR(cudaGetDeviceFlags_v7000); + REGISTER_RUNTIME_CBID_STR( + cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000); + REGISTER_RUNTIME_CBID_STR(cudaMemRangeGetAttribute_v8000); + REGISTER_RUNTIME_CBID_STR(cudaMemRangeGetAttributes_v8000); #if CUDA_VERSION >= 9000 REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000); REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000); + REGISTER_RUNTIME_CBID_STR(cudaFuncSetAttribute_v9000); + REGISTER_RUNTIME_CBID_STR(cudaGraphLaunch_v10000); + REGISTER_RUNTIME_CBID_STR(cudaStreamSetAttribute_v11000); + REGISTER_RUNTIME_CBID_STR(cudaMallocAsync_v11020); + REGISTER_RUNTIME_CBID_STR(cudaFreeAsync_v11020); #endif #undef REGISTER_RUNTIME_CBID_STR } From 352f02e869be9bccd1c9d154d2c70151626a43ea Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 9 Sep 2025 16:45:38 +0800 Subject: [PATCH 03/58] [Metax] fix dgc & mklml compile product path problem (#8) --- backends/metax_gpu/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 5022e1bdde3..beb442eadad 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -26,6 +26,10 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}") set(WITH_MKLML ON) +set(THIRD_PARTY_PATH + "${PADDLE_SOURCE_DIR}/build/third_party" + CACHE PATH "Third party libraries directory.") + include(paddle) include(version) include(generic) @@ -52,10 +56,6 @@ option(ON_INFER "compile with inference c++ lib" OFF) option(WITH_GPU "Compile PaddlePaddle with METAX_GPU" ON) option(WITH_CUSTOM_DEVICE "Compile PaddlePaddle with CUSTOM_DEVICE" ON) -set(THIRD_PARTY_PATH - "${PADDLE_SOURCE_DIR}/build/third_party" - CACHE PATH "Third party libraries directory.") - macro(UNSET_VAR VAR_NAME) unset(${VAR_NAME} CACHE) unset(${VAR_NAME}) From 8f13faed41890653f7f57328674c672c77dcfa4c Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:18:33 +0800 Subject: [PATCH 04/58] [Metax] fix accuracy kernel & add test_accuracy_op_metax.py unit test (#9) * [Metax] fix dgc & mklml compile product path problem * [Metax] fix accuracy kernel & add test_accuracy_op_metax.py unit test * [Metax] add mixed_vector fix & update change patch --- backends/metax_gpu/CMakeLists.txt | 2 +- backends/metax_gpu/build.sh | 26 +- backends/metax_gpu/build_in_metax.sh | 17 +- backends/metax_gpu/change_patch.sh | 9 +- .../cuda_kernels/accuracy_kernel_register.cu | 141 ++- backends/metax_gpu/patch/tmp/mixed_vector.cc | 111 ++ backends/metax_gpu/patch/tmp/mixed_vector.h | 413 ++++++++ .../tests/unittest/test_accuracy_op_metax.py | 206 ++++ .../tests/unittest/test_gather_op_metax.py | 983 +++++++++++++++--- 9 files changed, 1740 insertions(+), 168 deletions(-) create mode 100644 backends/metax_gpu/patch/tmp/mixed_vector.cc create mode 100644 backends/metax_gpu/patch/tmp/mixed_vector.h create mode 100644 backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index beb442eadad..4567723123c 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -128,7 +128,7 @@ file( 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/arange_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/adadelta_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/accuracy_check_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/accuracy_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/accuracy_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/allclose_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/all_gather_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/all_reduce_kernel.cu diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 0350a32521f..dd0ab3aab90 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -31,25 +31,7 @@ git submodule sync --recursive && git submodule update --init --recursive # apply patch - -rm -r ../../Paddle/third_party/eigen3 - - -cd patch - -unzip mcEigen_3.4.0_paddle_final.zip - -mv mcEigen_3.4.0_paddle_final eigen3 - -cd .. - -cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 - -cd ../../Paddle/ - -git apply --verbose ../backends/metax_gpu/patch/paddle.patch - -cd - +bash change_patch.sh export MACA_PATH=/opt/maca diff --git a/backends/metax_gpu/build_in_metax.sh b/backends/metax_gpu/build_in_metax.sh index b1f9d63d85c..67ec1a2c31c 100644 --- a/backends/metax_gpu/build_in_metax.sh +++ b/backends/metax_gpu/build_in_metax.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,16 +22,7 @@ git submodule sync --recursive && git submodule update --init --recursive # apply patch - -rm -r ../../Paddle/third_party/eigen3 -cd patch -unzip mcEigen_3.4.0_paddle_final.zip -mv mcEigen_3.4.0_paddle_final eigen3 -cd .. -cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 -cd ../../Paddle/ -git apply --verbose ../backends/metax_gpu/patch/paddle.patch -cd - +bash change_patch.sh export MACA_PATH=/opt/maca export CUDA_PATH=/workspace/cuda-11.7/ diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index 58bda1aacd4..833ae00f6bd 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,11 +16,12 @@
 # limitations under the License.
 
 rm -r ../../Paddle/third_party/eigen3
-cd patch 
+cd patch
 unzip mcEigen_3.4.0_paddle_final.zip
 mv mcEigen_3.4.0_paddle_final eigen3
 cd ..
 cp -r patch/eigen3/ ../../Paddle/third_party/eigen3
+cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core
 cd ../../Paddle/
 git apply --verbose ../backends/metax_gpu/patch/paddle.patch
 cd -
diff --git a/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu
index 1b26e5711ac..0d61c79d0fa 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu
@@ -1,7 +1,7 @@
 // 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights
 // Reserved.
 
-// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,19 +14,150 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
+#include <thrust/execution_policy.h>
+#include <thrust/reduce.h>
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/accuracy_kernel.h"
 
+namespace phi {
+using phi::PADDLE_CUDA_NUM_THREADS;
+
+template <int BlockSize, typename T>
+__global__ void AccuracyCudaKernel(const int N,
+                                   const int D,
+                                   const int64_t* Xdata,
+                                   const int64_t* labeldata,
+                                   int* correct_data,
+                                   T* accuracy,
+                                   int* total_data) {
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  int count = 0;
+  __shared__ int total[BlockSize];
+
+  // support only 1 block
+  for (int i = threadIdx.x; i < (N); i += BlockSize) {
+    for (int j = 0; j < D; ++j) {
+      if (Xdata[i * D + j] == labeldata[i]) {
+        ++count;
+        break;
+      }
+    }
+  }
+  total[threadIdx.x] = count;
+  __syncthreads();
+
+  // reduce the count with init value 0, and output accuracy.
+  // #ifdef PADDLE_WITH_CUDA
+  //   int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
+  // #else
+  // HIP thrust::reduce not support __device__
+  for (int s = BlockSize / 2; s > 0; s >>= 1) {
+    if (threadIdx.x < s) {
+      total[threadIdx.x] += total[threadIdx.x + s];
+    }
+    __syncthreads();
+  }
+  int result = total[0];
+  // #endif
+  if (threadIdx.x == 0) {
+    *correct_data = result;
+    *accuracy = static_cast<T>(static_cast<MT>(result) / static_cast<MT>(N));
+    *total_data = N;
+  }
+}
+
+template <typename T, typename Context>
+void AccuracyKernel(const Context& dev_ctx,
+                    const DenseTensor& inference,
+                    const DenseTensor& indices,
+                    const DenseTensor& label,
+                    DenseTensor* accuracy,
+                    DenseTensor* correct,
+                    DenseTensor* total) {
+  // FIXME(typhoonzero): only support indices currently
+  // if add support for output values, how to detect the data type?
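+  // Shape/type expectations, per the checks below: "Indices" holds the
+  // top-k predicted class ids per sample with shape [num_samples, class_dim],
+  // "Label" holds one ground-truth id per sample, and both are int64.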
+  const int64_t* indices_data = indices.data<int64_t>();
+  const int64_t* label_data = label.data<int64_t>();
+
+  PADDLE_ENFORCE_EQ(
+      inference.dims().size(),
+      2,
+      common::errors::InvalidArgument(
+          "Rank(Input) of AccuracyOp must be 2, with shape "
+          "[sample_number, class_dim], But received rank(Input) is %d",
+          inference.dims().size()));
+
+  int* correct_data = dev_ctx.template Alloc<int>(correct);
+  int* total_data = dev_ctx.template Alloc<int>(total);
+  T* accuracy_data = dev_ctx.template Alloc<T>(accuracy);
+
+  int num_samples = static_cast<int>(inference.dims()[0]);
+  size_t infer_width = inference.dims()[1];
+  auto stream = dev_ctx.stream();
+  phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream);
+
+  PADDLE_ENFORCE_GT(label.dims().size(),
+                    0,
+                    common::errors::InvalidArgument(
+                        "Rank(Label) of AccuracyOp must greater than 0, "
+                        "But received rank(Label) is %d",
+                        label.dims().size()));
+
+  PADDLE_ENFORCE_GE(label.dims()[0],
+                    inference.dims()[0],
+                    common::errors::InvalidArgument(
+                        "num_samples(%d) of Label should less than "
+                        "or equal to num_samples(%d) of Input",
+                        label.dims()[0],
+                        num_samples));
+
+  if (num_samples == 0) {
+    return;
+  }
+
+  AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS, T>
+      <<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(num_samples,
+                                                  infer_width,
+                                                  indices_data,
+                                                  label_data,
+                                                  correct_data,
+                                                  accuracy_data,
+                                                  total_data);
+}
+}  // namespace phi
+
+// FIXME(typhoonzero): types of T is for inference data.
+// label data is always int64
+PD_REGISTER_KERNEL(accuracy,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::AccuracyKernel,
+                   phi::float16,
+                   phi::bfloat16,
+                   float,
+                   double) {
+  kernel->InputAt(1).SetDataType(phi::DataType::INT64);
+  kernel->InputAt(2).SetDataType(phi::DataType::INT64);
+  kernel->OutputAt(1).SetDataType(phi::DataType::INT32);
+  kernel->OutputAt(2).SetDataType(phi::DataType::INT32);
+}
+
 PD_CUSTOM_KERNEL_REGISTER(accuracy,
                           metax_gpu,
                           ALL_LAYOUT,
                           phi::AccuracyKernel,
-                          phi::dtype::float16,
-                          phi::dtype::bfloat16,
+                          phi::float16,
+                          phi::bfloat16,
                           float,
                           double) {
-  kernel->InputAt(1).SetDataType(phi::DataType::INT32);
-  kernel->InputAt(2).SetDataType(phi::DataType::INT32);
+  kernel->InputAt(1).SetDataType(phi::DataType::INT64);
+  kernel->InputAt(2).SetDataType(phi::DataType::INT64);
   kernel->OutputAt(1).SetDataType(phi::DataType::INT32);
   kernel->OutputAt(2).SetDataType(phi::DataType::INT32);
 }
diff --git a/backends/metax_gpu/patch/tmp/mixed_vector.cc b/backends/metax_gpu/patch/tmp/mixed_vector.cc
new file mode 100644
index 00000000000..a90113c7977
--- /dev/null
+++ b/backends/metax_gpu/patch/tmp/mixed_vector.cc
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/mixed_vector.h"
+
+#include <algorithm>
+#include <initializer_list>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "glog/logging.h"
+#include "paddle/phi/backends/context_pool.h"
+#include "paddle/phi/common/memory_utils.h"
+#include "paddle/utils/none.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T>
+void CopyToCPUHelper(std::vector<T> *cpu_,
+                     phi::Allocator::AllocationPtr *gpu_,
+                     size_t *gpu_memory_size_) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  // COPY GPU Data To CPU
+  auto *dev_ctx = static_cast<phi::GPUContext *>(
+      phi::DeviceContextPool::Instance().Get((*gpu_)->place()));
+  auto stream = dev_ctx->stream();
+  void *src = (*gpu_)->ptr();
+  void *dst = cpu_->data();
+  auto place = dev_ctx->GetPlace();
+  if (place.GetType() == phi::AllocationType::GPU) {
+    memory_utils::Copy(phi::CPUPlace(),
+                       dst,
+                       OptionalCUDAPlace(*gpu_).get(),
+                       src,
+                       *gpu_memory_size_,
+                       stream);
+  } else {
+    memory_utils::Copy(phi::CPUPlace(),
+                       dst,
+                       OptionalCustomPlace(*gpu_).get(),
+                       src,
+                       *gpu_memory_size_,
+                       stream);
+  }
+  dev_ctx->Wait();
+#endif
+}
+
+template <typename T>
+void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
+                             phi::Allocator::AllocationPtr *gpu_,
+                             size_t *gpu_memory_size_,
+                             const phi::Place &place) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void *src = cpu_->data();
+  *gpu_memory_size_ = cpu_->size() * sizeof(T);  // sizeof(T)
+  (*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_);
+  void *dst = (*gpu_)->ptr();
+  auto *dev_ctx = static_cast<phi::GPUContext *>(
+      phi::DeviceContextPool::Instance().Get(place));
+  auto stream = dev_ctx->stream();
+  if (place.GetType() == phi::AllocationType::GPU) {
+    memory_utils::Copy(OptionalCUDAPlace(*gpu_).get(),
+                       dst,
+                       phi::CPUPlace(),
+                       src,
+                       *gpu_memory_size_,
+                       stream);
+  } else {
+    memory_utils::Copy(OptionalCustomPlace(*gpu_).get(),
+                       dst,
+                       phi::CPUPlace(),
+                       src,
+                       *gpu_memory_size_,
+                       stream);
+  }
+  dev_ctx->Wait();
+#endif
+}
+
+#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__)                                 \
+  template <>                                                                 \
+  void MixVector<__TYPE__>::VectorData::CopyToCPU() const {                   \
+    CopyToCPUHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_);                \
+  }                                                                           \
+                                                                              \
+  template <>                                                                 \
+  void MixVector<__TYPE__>::VectorData::CopyCPUDataToCUDA(                    \
+      const phi::Place &place) const {                                        \
+    CopyCPUDataToCUDAHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_, place); \
+  }
+
+INSTANTIATE_VECTOR_FOR_TYPE(size_t)
+INSTANTIATE_VECTOR_FOR_TYPE(int)
+INSTANTIATE_VECTOR_FOR_TYPE(int64_t)
+
+};  // namespace phi
diff --git a/backends/metax_gpu/patch/tmp/mixed_vector.h b/backends/metax_gpu/patch/tmp/mixed_vector.h
new file mode 100644
index 00000000000..e7cf1e626c9
--- /dev/null
+++ b/backends/metax_gpu/patch/tmp/mixed_vector.h
@@ -0,0 +1,413 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <initializer_list>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "glog/logging.h"
+#include "paddle/common/errors.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/allocator.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/utils/none.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T>
+using Vector = std::vector<T>;
+
+inline paddle::optional<phi::Place> OptionalCUDAPlace(
+    const phi::Allocator::AllocationPtr &gpu_) {
+  return gpu_ == nullptr ? paddle::none
+                         : paddle::optional<phi::Place>(gpu_->place());
+}
+
+inline paddle::optional<phi::Place> OptionalCustomPlace(
+    const phi::Allocator::AllocationPtr &gpu_) {
+  return gpu_ == nullptr ? paddle::none
+                         : paddle::optional<phi::Place>(gpu_->place());
+}
+
+// Vector implements the std::vector interface, and can get Data or
+// MutableData from any place. The data will be synced implicitly inside.
+template <typename T>
+class MixVector {
+ public:
+  using value_type = T;
+  using iterator = typename std::vector<T>::iterator;
+  using const_iterator = typename std::vector<T>::const_iterator;
+
+ private:
+  // The actual class to implement vector logic
+  class VectorData {
+   public:
+    template <typename U>
+    explicit VectorData(std::vector<U> *dat) : cpu_(dat), flag_(kDataInCPU) {}
+    ~VectorData() {}
+
+    VectorData(const VectorData &o) = delete;
+
+    VectorData &operator=(const VectorData &o) = delete;
+
+    T &operator[](size_t i) {
+      MutableCPU();
+      return (*cpu_)[i];
+    }
+
+    const T &operator[](size_t i) const {
+      ImmutableCPU();
+      return (*cpu_)[i];
+    }
+
+    size_t size() const { return (*cpu_).size(); }
+
+    iterator begin() {
+      MutableCPU();
+      return (*cpu_).begin();
+    }
+
+    iterator end() {
+      MutableCPU();
+      return (*cpu_).end();
+    }
+
+    T &front() {
+      MutableCPU();
+      return (*cpu_).front();
+    }
+
+    T &back() {
+      MutableCPU();
+      return (*cpu_).back();
+    }
+
+    const_iterator begin() const {
+      ImmutableCPU();
+      return (*cpu_).begin();
+    }
+
+    const_iterator end() const {
+      ImmutableCPU();
+      return (*cpu_).end();
+    }
+
+    const T &back() const {
+      ImmutableCPU();
+      return (*cpu_).back();
+    }
+
+    T *data() { return cpu_->data(); }
+
+    const T *data() const { return cpu_->data(); }
+
+    const T &front() const {
+      ImmutableCPU();
+      return (*cpu_).front();
+    }
+
+    // assign this from iterator.
+    // NOTE: the iterator must support `end-begin`
+    template <typename Iter>
+    void assign(Iter begin, Iter end) {
+      MutableCPU();
+      (*cpu_).assign(begin, end);
+    }
+
+    // push_back. If the previous capacity is not enough, the memory will
+    // double.
+    void push_back(T elem) {
+      MutableCPU();
+      (*cpu_).push_back(elem);
+    }
+
+    // extend a vector by iterator.
+    // NOTE: the iterator must support end-begin
+    template <typename It>
+    void Extend(It begin, It end) {
+      MutableCPU();
+      auto out_it = std::back_inserter<std::vector<T>>(*(this->cpu_));
+      std::copy(begin, end, out_it);
+    }
+
+    // resize the vector
+    void resize(size_t size) {
+      MutableCPU();
+      (*cpu_).resize(size);
+    }
+
+    // get cuda ptr. immutable
+    const T *CUDAData(phi::Place place) const {
+      PADDLE_ENFORCE_EQ(
+          place.GetType() == phi::AllocationType::GPU ||
+              place.GetType() == phi::AllocationType::CUSTOM,
+          true,
+          common::errors::Unavailable(
+              "Place mismatch, CUDA Data must be on CUDA place."));
+      ImmutableCUDA(place);
+      return reinterpret_cast<T *>(gpu_->ptr());
+    }
+
+    // get cuda ptr. mutable
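+    // (The mutable variant below marks the buffer dirty-on-CUDA, so the next
+    // CPU-side access goes through MutableCPU()/ImmutableCPU(), which copy
+    // the data back first.)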
+    T *CUDAMutableData(phi::Place place) {
+      const T *ptr = CUDAData(place);
+      flag_ = kDirty | kDataInCUDA;
+      return const_cast<T *>(ptr);
+    }
+
+    // clear
+    void clear() {
+      (*cpu_).clear();
+      flag_ = kDirty | kDataInCPU;
+    }
+
+    std::vector<T> *get_vector() { return cpu_; }
+
+    size_t capacity() const { return (*cpu_).capacity(); }
+
+    // reserve data
+    void reserve(size_t size) const { (*cpu_).reserve(size); }
+
+    std::mutex &Mutex() const { return mtx_; }
+
+    paddle::optional<phi::Place> CUDAPlace() const {
+      return OptionalCUDAPlace(gpu_);
+    }
+
+    paddle::optional<phi::Place> CustomPlace() const {
+      return OptionalCustomPlace(gpu_);
+    }
+
+    void MutableCPU() {
+      if (IsInCUDA() && IsDirty()) {
+        CopyToCPU();
+      }
+      flag_ = kDirty | kDataInCPU;
+    }
+
+   private:
+    enum DataFlag {
+      kDataInCPU = 0x01,
+      kDataInCUDA = 0x02,
+      // kDirty means the data has been changed in one device.
+      kDirty = 0x10
+    };
+
+    void CopyToCPU() const;
+
+    void ImmutableCUDA(phi::Place place) const {
+      if (IsDirty()) {
+        if (IsInCPU()) {
+          CopyCPUDataToCUDA(place);
+          UnsetFlag(kDirty);
+          SetFlag(kDataInCUDA);
+        } else if (IsInCUDA() && !(place == gpu_->place())) {
+          PADDLE_THROW(
+              common::errors::Unavailable("Unexpected data place mismatch."));
+          // Still dirty
+        } else {
+          // Dirty && DataInCUDA && Device is same
+          // Do nothing
+        }
+      } else {
+        if (!IsInCUDA()) {
+          // Even data is not dirty. However, data is not in CUDA. Copy data.
+          CopyCPUDataToCUDA(place);
+          SetFlag(kDataInCUDA);
+        } else if (!(place == gpu_->place())) {
+          PADDLE_THROW(
+              common::errors::Unavailable("Unexpected data place mismatch."));
+        } else {
+          // Not Dirty && DataInCUDA && Device is same
+          // Do nothing.
+        }
+      }
+    }
+
+    void CopyCPUDataToCUDA(const phi::Place &place) const;
+
+    void ImmutableCPU() const {
+      if (IsDirty() && !IsInCPU()) {  // If data has been changed in CUDA, or
+                                      // CPU has no data.
+        CopyToCPU();
+        UnsetFlag(kDirty);
+      }
+      SetFlag(kDataInCPU);
+    }
+
+    void UnsetFlag(int flag) const { flag_ &= ~flag; }
+    void SetFlag(int flag) const { flag_ |= flag; }
+
+    bool IsDirty() const { return flag_ & kDirty; }
+
+    bool IsInCUDA() const { return flag_ & kDataInCUDA; }
+
+    bool IsInCPU() const { return flag_ & kDataInCPU; }
+
+    std::vector<T> *cpu_;
+    mutable phi::Allocator::AllocationPtr gpu_;
+    mutable size_t gpu_memory_size_{0};
+    mutable int flag_;
+
+    mutable std::mutex mtx_;
+  };
+
+ public:
+  // implicit cast from std::vector.
+  template <typename U>
+  MixVector(const std::vector<U> *dat) {  // NOLINT
+    m_.reset(new VectorData(const_cast<std::vector<U> *>(dat)));
+  }
+
+  // Copy ctor
+  MixVector(const MixVector<T> &other) = delete;
+
+  // Copy operator
+  MixVector<T> &operator=(const MixVector<T> &other) = delete;
+
+  // Move ctor
+  MixVector(MixVector<T> &&other) = delete;
+
+  // CPU data access method. Mutable.
+  T &operator[](size_t i) { return (*m_)[i]; }
+
+  // CPU data access method. Immutable.
+  const T &operator[](size_t i) const { return (*m_)[i]; }
+
+  // std::vector iterator methods. Based on CPU data access method
+  size_t size() const { return m_->size(); }
+
+  iterator begin() { return m_->begin(); }
+
+  iterator end() { return m_->end(); }
+
+  T &front() { return m_->front(); }
+
+  T &back() { return m_->back(); }
+
+  const_iterator begin() const { return m_->begin(); }
+
+  const_iterator end() const { return m_->end(); }
+
+  const_iterator cbegin() const { return begin(); }
+
+  const_iterator cend() const { return end(); }
+
+  const T &back() const { return m_->back(); }
+
+  T *data() { return m_->data(); }
+
+  const T *data() const { return m_->data(); }
+
+  const T &front() const { return m_->front(); }
+  // end of std::vector iterator methods
+
+  // assign this from iterator.
+  // NOTE: the iterator must support `end-begin`
+  template <typename Iter>
+  void assign(Iter begin, Iter end) {
+    m_->assign(begin, end);
+  }
+
+  // push_back. If the previous capacity is not enough, the memory will
+  // double.
+  void push_back(T elem) { m_->push_back(elem); }
+
+  // extend a vector by iterator.
+  // NOTE: the iterator must support end-begin
+  template <typename It>
+  void Extend(It begin, It end) {
+    m_->Extend(begin, end);
+  }
+
+  // resize the vector
+  void resize(size_t size) {
+    if (m_->size() != size) {
+      m_->resize(size);
+    }
+  }
+
+  // get cuda ptr. immutable
+  const T *CUDAData(phi::Place place) const {
+    {
+      phi::GPUPlace p(place.GetDeviceId());
+      auto &mtx = m_->Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_->CUDAPlace();
+      if (cuda_place == paddle::none || cuda_place == p) {
+        return m_->CUDAData(place);
+      }
+    }
+    m_->MutableCPU();
+    m_.reset(new VectorData(m_->get_vector()));
+    return CUDAData(place);
+  }
+
+  // get cuda ptr. mutable
+  T *CUDAMutableData(phi::Place place) {
+    {
+      phi::GPUPlace p(place.GetDeviceId());
+      auto &mtx = m_->Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_->CUDAPlace();
+      if (cuda_place == paddle::none || cuda_place == p) {
+        return m_->CUDAMutableData(place);
+      }
+    }
+    m_->MutableCPU();
+    m_.reset(new VectorData(m_->get_vector()));
+    return CUDAMutableData(place);
+  }
+
+  // clear
+  void clear() { m_->clear(); }
+
+  size_t capacity() const { return m_->capacity(); }
+
+  // reserve data
+  void reserve(size_t size) { m_->reserve(size); }
+
+  // the unify method to access CPU or CUDA data. immutable.
+  const T *Data(phi::Place place) const {
+    if (place.GetType() == phi::AllocationType::GPU) {
+      return CUDAData(place);
+    } else {
+      return data();
+    }
+  }
+
+  // the unify method to access CPU or CUDA data. mutable.
+  T *MutableData(phi::Place place) {
+    if (place.GetType() == phi::AllocationType::GPU) {
+      return CUDAMutableData(place);
+    } else {
+      return data();
+    }
+  }
+
+  void CopyToCPU() { m_->MutableCPU(); }
+
+  const void *Handle() const { return m_.get(); }
+
+ private:
+  mutable std::unique_ptr<VectorData> m_;
+};
+
+};  // namespace phi
diff --git a/backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py b/backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py
new file mode 100644
index 00000000000..910ef5cd1a6
--- /dev/null
+++ b/backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from op_test import ( + OpTest, + convert_float_to_uint16, + paddle_static_guard, + is_custom_device, + get_device_place, +) + +import paddle +from paddle import base +from paddle.base import Program, core, program_guard + + +def accuracy_wrapper(infer, indices, label): + return paddle._C_ops.accuracy(infer, indices, label) + + +class TestAccuracyOp(OpTest): + def setUp(self): + self.op_type = "accuracy" + self.python_api = accuracy_wrapper + self.dtype = np.float32 + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)).astype("int64") + label = np.random.randint(0, 2, (n, 1)).astype("int64") + self.inputs = {"Out": infer, "Indices": indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + "Accuracy": np.array(num_correct / float(n)).astype(self.dtype), + "Correct": np.array(num_correct).astype("int32"), + "Total": np.array(n).astype("int32"), + } + + def init_dtype(self): + pass + + def test_check_output(self): + self.check_output(check_pir=True) + + +class TestAccuracyOpFp16(TestAccuracyOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-3, check_pir=True) + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()) + or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestAccuracyOpBf16(OpTest): + def setUp(self): + self.op_type = "accuracy" + self.python_api = accuracy_wrapper + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(np.float32) + indices = np.random.randint(0, 2, (n, 1)).astype("int64") + label = np.random.randint(0, 2, (n, 1)).astype("int64") + self.inputs = { + "Out": convert_float_to_uint16(infer), + "Indices": indices, + "Label": label, + } + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + "Accuracy": convert_float_to_uint16( + np.array(num_correct / float(n)).astype(np.float32) + ), + "Correct": np.array(num_correct).astype("int32"), + "Total": np.array(n).astype("int32"), + } + + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + if core.is_compiled_with_cuda() or is_custom_device(): + place = get_device_place() + self.check_output_with_place(place, atol=1e-2, check_pir=True) + + +class TestAccuracyOpError(unittest.TestCase): + def test_type_errors(self): + with ( + paddle_static_guard(), + program_guard(Program(), Program()), + ): + # The input type of accuracy_op must be Variable. + x1 = base.create_lod_tensor(np.array([[-1]]), [[1]], base.CPUPlace()) + label = paddle.static.data(name="label", shape=[-1, 1], dtype="int32") + self.assertRaises(TypeError, paddle.static.accuracy, x1, label) + self.assertRaises(TypeError, paddle.metric.accuracy, x1, label) + # The input dtype of accuracy_op must be float32 or float64. 
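+            # (so the int32 tensor below is expected to be rejected with
+            # TypeError, while the float32 tensor that follows is accepted)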
+ x2 = paddle.static.data(name="x2", shape=[-1, 4], dtype="int32") + self.assertRaises(TypeError, paddle.static.accuracy, x2, label) + self.assertRaises(TypeError, paddle.metric.accuracy, x2, label) + + x3 = paddle.static.data(name="input", shape=[-1, 2], dtype="float32") + paddle.static.accuracy(input=x3, label=label) + paddle.metric.accuracy(input=x3, label=label) + + def test_value_errors(self): + with ( + program_guard(Program(), Program()), + # The input rank of accuracy_op must be 2. + self.assertRaises(ValueError), + ): + x3 = paddle.to_tensor([0.1], dtype="float32") + label3 = paddle.to_tensor(np.reshape([0], [1, 1]), dtype="int32") + paddle.metric.accuracy(x3, label3) + + +class TestAccuracyAPI1(unittest.TestCase): + def run_api(self, accuracy_api): + with ( + paddle_static_guard(), + paddle.static.program_guard(paddle.static.Program()), + ): + self.predictions = paddle.static.data( + shape=[2, 5], name="predictions", dtype="float32" + ) + self.label = paddle.static.data(shape=[2, 1], name="labels", dtype="int64") + self.result = accuracy_api(input=self.predictions, label=self.label, k=1) + self.input_predictions = np.array( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype="float32", + ) + self.input_labels = np.array([[2], [0]], dtype="int64") + self.expect_value = np.array([0.5], dtype="float32") + exe = paddle.static.Executor() + (result,) = exe.run( + feed={ + "predictions": self.input_predictions, + "labels": self.input_labels, + }, + fetch_list=[self.result], + ) + self.assertEqual((result == self.expect_value).all(), True) + + def test_api(self): + self.run_api(accuracy_api=paddle.static.accuracy) + self.run_api(accuracy_api=paddle.metric.accuracy) + + +class TestAccuracyAPI2(unittest.TestCase): + def test_api(self): + with base.dygraph.guard(): + predictions = paddle.to_tensor( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype="float32", + ) + label = paddle.to_tensor([[2], [0]], dtype="int64") + result = paddle.static.accuracy(input=predictions, label=label, k=1) + expect_value = np.array([0.5], dtype="float32") + self.assertEqual((result.numpy() == expect_value).all(), True) + + +class TestAccuracyAPI(unittest.TestCase): + def test_api(self): + with base.dygraph.guard(): + predictions = paddle.to_tensor( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype="float32", + ) + label = paddle.to_tensor([[2], [0]], dtype="int64") + result = paddle.metric.accuracy(input=predictions, label=label, k=1) + expect_value = np.array([0.5], dtype="float32") + + self.assertEqual((result.numpy() == expect_value).all(), True) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unittest/test_gather_op_metax.py b/backends/metax_gpu/tests/unittest/test_gather_op_metax.py index bdf116571f7..3ce39588838 100644 --- a/backends/metax_gpu/tests/unittest/test_gather_op_metax.py +++ b/backends/metax_gpu/tests/unittest/test_gather_op_metax.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,14 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function
 import unittest
-from op_test import OpTest
 import numpy as np
-import paddle
+from op_test import (
+    OpTest,
+    convert_float_to_uint16,
+    get_devices,
+    is_custom_device,
+    get_device_place,
+)
+from utils import dygraph_guard
 
-paddle.enable_static()
+import paddle
+from paddle import base
+from paddle.base.dygraph.base import switch_to_static_graph
+from paddle.framework import core
 
 
 def gather_numpy(x, index, axis):
@@ -32,29 +40,119 @@ class TestGatherOp(OpTest):
     def setUp(self):
         self.op_type = "gather"
-        self.place = paddle.CustomPlace("metax_gpu", 0)
-        self.__class__.use_custom_device = True
         self.python_api = paddle.gather
+        self.public_python_api = paddle.gather
         self.config()
-        xnp = np.random.random(self.x_shape).astype(self.x_type)
-        self.inputs = {"X": xnp, "Index": np.array(self.index).astype(self.index_type)}
-        self.outputs = {"Out": self.inputs["X"][self.inputs["Index"]]}
+        self.prim_op_type = "prim"
+        self.init_inputs_and_outputs()
+        self.if_enable_cinn()
 
     def test_check_output(self):
-        self.check_output_with_place(self.place)
+        self.check_output(check_pir=True, check_symbol_infer=False)
 
     def test_check_grad(self):
-        self.check_grad_with_place(self.place, ["X"], "Out")
+        self.check_grad(["X"], "Out", check_pir=True, check_prim_pir=True)
 
     def config(self):
         """
         For multi-dimension input
         """
         self.x_shape = (10, 20)
-        self.x_type = "float32"
+        self.config_dtype()
         self.index = [1, 3, 5]
         self.index_type = "int32"
 
+    def config_dtype(self):
+        self.x_type = "float64"
+
+    def init_inputs_and_outputs(self):
+        xnp = np.random.random(self.x_shape).astype(self.x_type)
+        if self.x_type == "complex64" or self.x_type == "complex128":
+            xnp = (
+                np.random.randint(-10, 10, size=(10, 10))
+                + 1j * np.random.randint(-10, 10, size=(10, 10))
+            ).astype(self.x_type)
+        self.inputs = {
+            "X": xnp,
+            "Index": np.array(self.index).astype(self.index_type),
+        }
+        self.outputs = {"Out": self.inputs["X"][self.inputs["Index"]]}
+
+    def if_enable_cinn(self):
+        pass
+
+
+class TestGatherOp_ZeroDim(TestGatherOp):
+    def config(self):
+        """
+        For multi-dimension input
+        """
+        self.x_shape = 100
+        self.config_dtype()
+        self.index = 2
+        self.index_type = "int32"
+
+    def if_enable_cinn(self):
+        self.enable_cinn = False
+
+
+class TestGatherOpFP16(TestGatherOp):
+    def config_dtype(self):
+        self.x_type = "float16"
+
+
+# @unittest.skipIf(
+#     not (core.is_compiled_with_cuda() or is_custom_device())
+#     # or core.cudnn_version() < 8100
+#     # or paddle.device.cuda.get_device_capability()[0] < 8,
+#     # "only support compiled with CUDA and cudnn version need larger than 8.1.0 and device's compute capability is at least 8.0",
+# )
+class TestGatherOpBFP16(TestGatherOp):
+    def config_dtype(self):
+        self.x_type = "float32"
+        self.dtype = np.uint16
+
+    def init_inputs_and_outputs(self):
+        xnp = np.random.random(self.x_shape).astype(self.x_type)
+        self.inputs = {
+            "X": convert_float_to_uint16(xnp),
+            "Index": np.array(self.index).astype(self.index_type),
+        }
+        self.outputs = {"Out": convert_float_to_uint16(xnp[self.inputs["Index"]])}
+
+    def if_enable_cinn(self):
+        self.enable_cinn = False
+
+    def test_check_output(self):
+        self.check_output_with_place(
+            place=get_device_place(), check_pir=True, check_symbol_infer=False
+        )
+
+    def test_check_grad(self):
+        self.check_grad_with_place(
+            get_device_place(),
+            ["X"],
+            "Out",
+            check_pir=True,
+            check_prim_pir=True,
+        )
+
+
+class TestGatherOpComplex64(TestGatherOp):
+    def config_dtype(self):
+        self.x_type = 
"complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOpComplex128(TestGatherOp): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + class TestCase1(TestGatherOp): def config(self): @@ -62,10 +160,42 @@ def config(self): For one dimension input """ self.x_shape = 100 - self.x_type = "float32" + self.config_dtype() self.index = [1, 3, 5] self.index_type = "int32" + def config_dtype(self): + self.x_type = "float64" + + +class TestCase1FP16(TestCase1): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase1BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = 100 + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int32" + + +class TestCase1Complex64(TestCase1): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase1Complex128(TestCase1): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + class TestCase2(TestGatherOp): def config(self): @@ -73,42 +203,574 @@ def config(self): For int64_t index type """ self.x_shape = 100 - self.x_type = "float32" + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase2FP16(TestCase2): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase2BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = 100 + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + + +class TestCase2Complex64(TestCase2): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase2Complex128(TestCase2): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase3(TestGatherOp): + def config(self): + """ + For other input type + """ + self.x_shape = (10, 20) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase3Fp16(TestCase3): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase3BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.config_dtype() self.index = [1, 3, 5] self.index_type = "int64" +class TestCase3Complex64(TestCase3): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase3Complex128(TestCase3): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase4(TestGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase4FP16(TestCase4): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase4BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1] + self.index_type = "int32" + + +class TestCase4Complex64(TestCase4): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase4Complex128(TestCase4): + def config_dtype(self): + self.x_type = 
"complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase5(TestGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1, 3] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase5BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1] + self.index_type = "int32" + + +class TestCase5FP16(TestCase5): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase5Complex64(TestCase5): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase5Complex128(TestCase5): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase6(TestGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": True} + self.config_dtype() + self.index = [1, 3] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase6FP16(TestCase6): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase6BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": True} + self.config_dtype() + self.index = [1, 3] + self.index_type = "int32" + + +class TestGatherBF16Op(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.dtype = np.uint16 + self.config() + xnp = np.random.random(self.x_shape).astype(np.float32) + axis_np = np.array(self.axis).astype(self.axis_type) + index_np = np.array(self.index).astype(self.index_type) + self.inputs = { + "X": convert_float_to_uint16(xnp), + "Index": index_np, + "Axis": axis_np, + } + out = gather_numpy(self.inputs["X"], index_np, axis_np[0]) + self.outputs = {"Out": out} + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def test_check_grad(self): + self.check_grad(["X"], "Out", numeric_grad_delta=0.5, check_pir=True) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 88, 3) + self.index = [1, 3, 5] + self.index_type = "int32" + self.axis = [1] + self.axis_type = "int32" + + +class TestGatherNegativeAxis(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.dtype = np.uint16 + self.config() + xnp = np.random.random(self.x_shape).astype(np.float32) + axis_np = np.array(self.axis).astype(self.axis_type) + index_np = np.array(self.index).astype(self.index_type) + self.inputs = { + "X": convert_float_to_uint16(xnp), + "Index": index_np, + "Axis": axis_np, + } + out = gather_numpy(self.inputs["X"], index_np, axis_np[0]) + self.outputs = {"Out": out} + + def test_check_output(self): + places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device_place()) + for place in places: + self.check_output_with_place(place) + + def test_check_grad(self): + places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device_place()) + for place in places: + self.check_grad_with_place(place, ["X"], "Out", numeric_grad_delta=0.5) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (100, 3) + self.index = [0, 1, -2] + self.index_type = "int32" + self.axis = [-1] + self.axis_type = "int32" + + 
+class TestOutOfRangeError(unittest.TestCase): + def test_dygraph_forward_and_backward(self): + with dygraph_guard(): + x = paddle.randn([100, 3]).cpu() + x.stop_gradient = False + y = paddle.gather( + x, + paddle.to_tensor([0, -2]).cpu(), + axis=-1, + ) + grad_x = paddle.grad(y, x) + + def test_dygraph_error(self): + with dygraph_guard(): + # out of lower bound + with self.assertRaises(IndexError): + _ = paddle.gather( + paddle.randn([100, 3]).cpu(), + paddle.to_tensor([0, -4]).cpu(), + axis=1, + ) + # out of upper bound + with self.assertRaises(IndexError): + _ = paddle.gather( + paddle.randn([100, 3]).cpu(), + paddle.to_tensor([0, 3]).cpu(), + axis=1, + ) + + +class TestCase6Complex64(TestCase6): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase6Complex128(TestCase6): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp1(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.config() + xnp = np.random.random(self.x_shape).astype(self.x_type) + axis_np = np.array(self.axis).astype(self.index_type) + index_np = np.array(self.index).astype(self.index_type) + out = gather_numpy(xnp, index_np, axis_np[0]) + self.inputs = {"X": xnp, "Index": index_np, "Axis": axis_np} + self.outputs = {"Out": out} + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def test_check_grad(self): + self.check_grad(["X"], "Out", check_pir=True) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 88, 3) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int32" + self.axis = [1] + self.axis_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp1FP16(TestGatherOp1): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp1Complex64(TestGatherOp1): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp1Complex128(TestGatherOp1): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp2(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 88, 10) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + self.axis = [0] + self.axis_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp2FP16(TestGatherOp2): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp2Complex64(TestGatherOp2): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp2Complex128(TestGatherOp2): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp3(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 88, 10) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + self.axis = [2] + self.axis_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp3FP16(TestGatherOp3): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp3Complex64(TestGatherOp3): + def config_dtype(self): + self.x_type = "complex64" + + def 
test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp3Complex128(TestGatherOp3): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp4(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 100, 10) + self.config_dtype() + self.index = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + self.index_type = "int64" + self.axis = [0] + self.axis_type = "int32" + self.attrs = {"overwrite": False} + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp4FP16(TestGatherOp4): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp4Complex64(TestGatherOp4): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp4Complex128(TestGatherOp4): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp5(TestGatherOp): + def config(self): + """ + Test for negative axis + """ + self.x_shape = (3, 100, 10) + self.config_dtype() + self.index = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + self.index_type = "int64" + self.axis = [-1] + self.axis_type = "int32" + self.attrs = {"overwrite": False} + + def config_dtype(self): + self.x_type = "float64" + + def test_check_grad(self): + self.check_grad( + ["X"], + "Out", + check_pir=True, + check_prim_pir=True, + ) + + +class API_TestGather(unittest.TestCase): + def test_out1(self): + with base.program_guard(base.Program(), base.Program()): + data1 = paddle.static.data("data1", shape=[-1, 2], dtype="float64") + index = paddle.static.data("index", shape=[-1, 1], dtype="int64") + out = paddle.gather(data1, index) + place = base.CPUPlace() + exe = base.Executor(place) + input = np.array([[1, 2], [3, 4], [5, 6]]).astype("float64") + index_1 = np.array([1, 2]).astype("int64") + (result,) = exe.run( + feed={"data1": input, "index": index_1}, fetch_list=[out] + ) + expected_output = np.array([[3, 4], [5, 6]]) + np.testing.assert_allclose(result, expected_output, rtol=1e-05) + + def test_out2(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.static.data("x", shape=[-1, 2], dtype="float64") + index = paddle.static.data("index", shape=[-1, 1], dtype="int32") + axis = paddle.static.data("axis", shape=[1], dtype="int32") + out = paddle.gather(x, index, axis) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + x_np = np.array([[1, 2], [3, 4], [5, 6]]).astype("float64") + index_np = np.array([1, 1]).astype("int32") + axis_np = np.array([1]).astype("int32") + (result,) = exe.run( + feed={"x": x_np, "index": index_np, "axis": axis_np}, + fetch_list=[out], + ) + expected_output = gather_numpy(x_np, index_np, axis_np[0]) + np.testing.assert_allclose(result, expected_output, rtol=1e-05) + + class API_TestDygraphGather(unittest.TestCase): def test_out1(self): - paddle.set_device("metax_gpu") paddle.disable_static() - input_1 = np.array([[1, 2], [3, 4], [5, 6]]).astype("int32") + input_1 = np.array([[1, 2], [3, 4], [5, 6]]) index_1 = np.array([1, 2]) input = paddle.to_tensor(input_1) index = paddle.to_tensor(index_1) output = paddle.gather(input, index) output_np = output.numpy() - expected_output = np.array([[3, 4], [5, 6]]).astype("int32") - np.testing.assert_allclose(output_np, expected_output) + expected_output = np.array([[3, 4], [5, 6]]) + np.testing.assert_allclose(output_np, 
expected_output, rtol=1e-05) paddle.enable_static() def test_out12(self): - paddle.set_device("metax_gpu") paddle.disable_static() - input_1 = np.array([[1, 2], [3, 4], [5, 6]]).astype("int32") + input_1 = np.array([[1, 2], [3, 4], [5, 6]]) index_1 = np.array([1, 2]) x = paddle.to_tensor(input_1) index = paddle.to_tensor(index_1) output = paddle.gather(x, index, axis=0) output_np = output.numpy() expected_output = gather_numpy(input_1, index_1, axis=0) - np.testing.assert_allclose(output_np, expected_output) + np.testing.assert_allclose(output_np, expected_output, rtol=1e-05) paddle.enable_static() def test_zero_index(self): - paddle.set_device("metax_gpu") paddle.disable_static() - x = paddle.to_tensor([[1, 2], [3, 4]]).astype("int32") + x = paddle.to_tensor([[1, 2], [3, 4]]) index = paddle.to_tensor(np.array([]).astype("int64")) for axis in range(len(x.shape)): out = paddle.gather(x, index, axis) @@ -117,122 +779,197 @@ def test_zero_index(self): self.assertEqual(list(out.shape), expected_shape) paddle.enable_static() + def test_large_data(self): + if not paddle.is_compiled_with_cuda(): + return -class TestGathertError(unittest.TestCase): - def setUp(self) -> None: - self.place = paddle.CustomPlace("metax_gpu", 0) - paddle.set_device("metax_gpu:0") + x = np.random.rand(226862, 256).astype("float32") + index = np.random.randint(-226862, 22682, size=(8859027)) - def test_error1(self): - paddle.enable_static() - if not paddle.framework.use_pir_api(): + def test_dygraph(): + with base.dygraph.guard(): + gpu_out = paddle.gather(paddle.to_tensor(x), paddle.to_tensor(index)) + return gpu_out.numpy() + + @switch_to_static_graph + def test_static_graph(): with paddle.static.program_guard( paddle.static.Program(), paddle.static.Program() ): - - input_shape = [8, 9, 6] - index_shape = [4] - x_int8 = paddle.static.data( - shape=input_shape, dtype="int8", name="x_int8" - ) - x_float32 = paddle.static.data( - shape=input_shape, dtype="float32", name="x_float32" - ) - axis = paddle.static.data(shape=[1], dtype="float32", name="axis") - index = paddle.static.data( - shape=index_shape, dtype="int32", name="index" - ) - index_float = paddle.static.data( - shape=index_shape, dtype="float32", name="index_float" + x_t = paddle.static.data(name="x", dtype=x.dtype, shape=x.shape) + index_t = paddle.static.data( + name="index", dtype=index.dtype, shape=index.shape ) + out_t = paddle.gather(x_t, index_t) + feed = {x_t.name: x, index_t.name: index} + fetch = [out_t] - def test_x_type(): - paddle.gather(x_int8, index) + gpu_exe = paddle.static.Executor(get_device_place()) + gpu_value = gpu_exe.run(feed=feed, fetch_list=fetch)[0] + return gpu_value - self.assertRaises(TypeError, test_x_type) + np.testing.assert_array_equal(test_dygraph(), test_static_graph()) - def test_index_type(): - paddle.gather(x_float32, index_float) - self.assertRaises(TypeError, test_index_type) +class TestGathertError(unittest.TestCase): + def test_error1(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + shape = [8, 9, 6] + x = paddle.static.data(shape=shape, dtype="int8", name="x") + axis = paddle.static.data(shape=[1], dtype="float32", name="axis") + index = paddle.static.data(shape=shape, dtype="int32", name="index") + index_float = paddle.static.data( + shape=shape, dtype="float32", name="index_float" + ) + + def test_x_type(): + paddle.gather(x, index) + + self.assertRaises((TypeError, ValueError), test_x_type) + + def test_index_type(): + paddle.gather(x, index_float) + + 
self.assertRaises((TypeError, ValueError), test_index_type) + + def test_axis_dtype(): + paddle.gather(x, index, axis=1.11) - def test_axis_dtype(): - paddle.gather(x_float32, index, axis=1.11) + self.assertRaises((TypeError, ValueError), test_axis_dtype) - self.assertRaises(TypeError, test_axis_dtype) + def test_axis_dtype1(): + paddle.gather(x, index, axis=axis) - def test_axis_dtype1(): - paddle.gather(x_float32, index, axis=axis) + self.assertRaises((TypeError, ValueError), test_axis_dtype1) - self.assertRaises(TypeError, test_axis_dtype1) - else: - paddle.set_device("metax_gpu") - input_shape = [8, 9, 6] - index_shape = [4] + def test_error2(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + shape = [8, 9, 6] + x = paddle.static.data(shape=shape, dtype="int8", name="x") + index = paddle.static.data(shape=shape, dtype="int32", name="mask") + index_float = paddle.static.data( + shape=shape, dtype="float32", name="index_float" + ) + + def test_x_type(): + paddle.gather(x, index) + + self.assertRaises((TypeError, ValueError), test_x_type) def test_index_type(): - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - x = paddle.static.data(shape=input_shape, dtype="float32", name="x") - index = paddle.static.data( - shape=index_shape, dtype="float32", name="index_float" - ) - out = paddle.gather(x, index) - exe = paddle.static.Executor(place=self.place) - exe.run(paddle.static.default_startup_program()) - self.assertRaises( - ValueError, - exe.run, - paddle.static.default_main_program(), - feed={ - "x": np.random.random(input_shape).astype("float32"), - "index_float": np.random.random(index_shape).astype( - "float32" - ), - }, - ) - - def test_axis_scalar_dtype(): - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - x = paddle.static.data(shape=input_shape, dtype="float32", name="x") - index = paddle.static.data( - shape=index_shape, dtype="int32", name="index" - ) - axis = paddle.static.data(shape=[1], dtype="int32", name="axis") - self.assertRaises(TypeError, paddle.gather, x, index, axis=1.11) - - def test_axis_tensor_dtype(): - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - x = paddle.static.data(shape=input_shape, dtype="float32", name="x") - index = paddle.static.data( - shape=index_shape, dtype="int32", name="index" - ) - axis = paddle.static.data(shape=[1], dtype="float32", name="axis") - y = paddle.gather(x, index, axis=axis) - exe = paddle.static.Executor(place=self.place) - exe.run(paddle.static.default_startup_program()) - self.assertRaises( - ValueError, - exe.run, - paddle.static.default_main_program(), - feed={ - "x": np.random.random(input_shape).astype("float32"), - "index": np.random.randint(0, 8, index_shape).astype( - "int32" - ), - "axis": np.array([1.11]).astype("float32"), - }, - ) - - test_index_type() - test_axis_scalar_dtype() - # test_axis_tensor_dtype() + paddle.gather(x, index_float) + + self.assertRaises((TypeError, ValueError), test_index_type) + + def test_error3(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + shape = [8, 9, 6] + x = paddle.static.data(shape=shape, dtype="int32", name="x") + index = paddle.static.data(shape=shape, dtype="int32", name="index") + + def test_axis_minsize(): + paddle.gather(x, index, axis=-1) + + self.assertRaises(ValueError, test_axis_minsize) + + def test_axis_maxsize(): + paddle.gather(x, index, 
axis=512) + + self.assertRaises(ValueError, test_axis_maxsize) + + +class TestCheckOutType(unittest.TestCase): + def test_out_type(self): + data = paddle.static.data(shape=[16, 10], dtype="int64", name="x") + index = paddle.static.data(shape=[4], dtype="int64", name="index") + out = paddle.gather(data, index) + self.assertTrue(out.dtype == paddle.int64 or out.dtype == core.DataType.INT64) + + def test_pir_out_type(self): + with paddle.pir_utils.IrGuard(): + data = paddle.static.data(shape=[16, 10], dtype="int64", name="x") + index = paddle.static.data(shape=[4], dtype="int64", name="index") + out = paddle.gather(data, index) + self.assertTrue(out.dtype == core.DataType.INT64) + + +class TestGatherBackward(unittest.TestCase): + def setUp(self): + self.shape = [10, 20] + self.dtype = "float32" + self.index = (1, 3, 5) + self.index_dtype = "int64" + self.places = get_devices() + + def test_gather_backward(self): + if len(self.places) != 2: + return + res_list = [] + x_np = np.random.random(self.shape).astype(self.dtype) + index_np = np.array(self.index, dtype=self.index_dtype) + grad_out_np = np.random.random(self.shape).astype(self.dtype) + for place in self.places: + with base.dygraph.guard(place): + x = paddle.to_tensor(x_np, dtype=self.dtype) + x.stop_gradient = False + index = paddle.to_tensor(index_np, dtype=self.index_dtype) + out = paddle.gather(x, index, -1) + grad_out = paddle.to_tensor(grad_out_np, dtype=self.dtype) + (re,) = paddle.grad( + outputs=out, + inputs=x, + grad_outputs=grad_out, + ) + res_list.append(re.numpy()) + np.testing.assert_allclose(res_list[0], res_list[1]) + + +class TestGatherOp_ZeroSize(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.public_python_api = paddle.gather + self.config() + self.init_inputs_and_outputs() + + def test_check_output(self): + self.check_output(check_pir=True) + + def test_check_grad(self): + self.check_grad(["X"], "Out", check_pir=True) + + def config(self): + self.x_shape = (3, 0, 4) + self.config_dtype() + self.index = [2] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + def init_inputs_and_outputs(self): + xnp = np.random.random(self.x_shape).astype(self.x_type) + self.inputs = { + "X": xnp, + "Index": np.array(self.index).astype(self.index_type), + } + self.outputs = {"Out": self.inputs["X"][self.inputs["Index"]]} + + +class TestGatherOp_ZeroSize2(TestGatherOp_ZeroSize): + def config(self): + self.x_shape = (10, 20) + self.config_dtype() + self.index = [2, 0] + self.index_type = "int32" if __name__ == "__main__": + paddle.enable_static() unittest.main() From 893829371efacbff859d0eb83c7ea827f5bb0124 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:29:10 +0800 Subject: [PATCH 05/58] [Metax] update metax_gpu CMakeLists.txt (#10) * [Metax] fix dgc & mklml compile product path problem * [Metax] fix accuracy kernel & add test_accuracy_op_metax.py unit test * [Metax] add mixed_vector fix & update change patch * [Metax] update metax_gpu CMakeLists.txt --- backends/metax_gpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 4567723123c..b22d7077e3b 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -26,11 +26,11 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}") set(WITH_MKLML ON) 
+include(paddle) set(THIRD_PARTY_PATH "${PADDLE_SOURCE_DIR}/build/third_party" CACHE PATH "Third party libraries directory.") -include(paddle) include(version) include(generic) include(cblas) From f54187fb3e47ed8062537b9d339c48c7fd711326 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:51:43 +0800 Subject: [PATCH 06/58] [metax] updata_qr_kernel (#11) * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .../metax_kernel/qr_kernel_register.cu | 207 +++++++++--------- 1 file changed, 98 insertions(+), 109 deletions(-) diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu index 7b133371f4d..745069e2eda 100644 --- a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu @@ -22,9 +22,8 @@ #include #include -#include "kernels/impl/values_vectors_functor.h" +#include "kernels/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/complex.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" @@ -39,7 +38,6 @@ #include "paddle/phi/kernels/slice_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/phi/kernels/tril_triu_kernel.h" - namespace phi { template @@ -358,47 +356,47 @@ void QrKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP #define FUNC_WITH_TYPES(m) m(float, s) m(double, d) -#define GEQRF_BATCH_INSTANCE(T, C) \ - template <> \ - void BatchedGeqrf(const GPUContext& dev_ctx, \ - int batch_size, \ - int m, \ - int n, \ - T* a, \ - int lda, \ - T* tau, \ - int a_stride, \ - int tau_stride) { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - for (int i = 0; i < batch_size; ++i) { \ - T* a_working_ptr = &a[i * a_stride]; \ - T* tau_working_ptr = &tau[i * tau_stride]; \ - PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##geqrf( \ - handle, m, n, a_working_ptr, lda, tau_working_ptr)); \ - } \ +#define GEQRF_BATCH_INSTANCE(T, C) \ + template <> \ + void BatchedGeqrf(const GPUContext& dev_ctx, \ + int batch_size, \ + int m, \ + int n, \ + T* a, \ + int lda, \ + T* tau, \ + int a_stride, \ + int tau_stride) { \ + auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); \ + for (int i = 0; i < batch_size; ++i) { \ + T* a_working_ptr = &a[i * a_stride]; \ + T* tau_working_ptr = &tau[i * tau_stride]; \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##geqrf( \ + handle, m, n, a_working_ptr, lda, tau_working_ptr)); \ + } \ } FUNC_WITH_TYPES(GEQRF_BATCH_INSTANCE); -#define ORGQR_BATCH_INSTANCE(T, C) \ - template <> \ - void BatchedOrgqr(const GPUContext& dev_ctx, \ - int batch_size, \ - int m, \ - int n, \ - int k, \ - T* a, \ - int lda, \ - T* tau, \ - int a_stride, \ - int tau_stride) { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - 
for (int i = 0; i < batch_size; ++i) { \ - T* a_working_ptr = &a[i * a_stride]; \ - T* tau_working_ptr = &tau[i * tau_stride]; \ - PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##orgqr( \ - handle, m, n, k, a_working_ptr, lda, tau_working_ptr)); \ - } \ +#define ORGQR_BATCH_INSTANCE(T, C) \ + template <> \ + void BatchedOrgqr(const GPUContext& dev_ctx, \ + int batch_size, \ + int m, \ + int n, \ + int k, \ + T* a, \ + int lda, \ + T* tau, \ + int a_stride, \ + int tau_stride) { \ + auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); \ + for (int i = 0; i < batch_size; ++i) { \ + T* a_working_ptr = &a[i * a_stride]; \ + T* tau_working_ptr = &tau[i * tau_stride]; \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##orgqr( \ + handle, m, n, k, a_working_ptr, lda, tau_working_ptr)); \ + } \ } FUNC_WITH_TYPES(ORGQR_BATCH_INSTANCE); @@ -421,7 +419,6 @@ void BatchedGeqrf(const GPUContext& dev_ctx, const int64_t a_stride_64 = static_cast(a_stride); const int64_t tau_stride_64 = static_cast(tau_stride); - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); size_t workspace_in_bytes_on_device = 0; @@ -499,7 +496,6 @@ void BatchedGeqrf(const GPUContext& dev_ctx, } else { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSgeqrf_bufferSize( handle, m, n, a, lda, &lwork)); @@ -555,7 +551,6 @@ void BatchedGeqrf(const GPUContext& dev_ctx, int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cusolverDnDgeqrf_bufferSize(handle, m, n, a, lda, &lwork)); @@ -599,35 +594,33 @@ void BatchedGeqrf(const GPUContext& dev_ctx, } template <> -void BatchedGeqrf>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + phi::complex64* a, + int lda, + phi::complex64* tau, + int a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgeqrf_bufferSize( handle, m, n, reinterpret_cast(a), lda, &lwork)); DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex64* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex64* a_working_ptr = &a[i * a_stride]; + phi::complex64* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgeqrf( handle, @@ -657,35 +650,33 @@ void BatchedGeqrf>( } template <> -void BatchedGeqrf>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const GPUContext& dev_ctx, + int batch_size, + int m, + 
int n, + phi::complex128* a, + int lda, + phi::complex128* tau, + int a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgeqrf_bufferSize( handle, m, n, reinterpret_cast(a), lda, &lwork)); DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex128* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex128* a_working_ptr = &a[i * a_stride]; + phi::complex128* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgeqrf( handle, @@ -727,7 +718,6 @@ void BatchedOrgqr(const GPUContext& dev_ctx, int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); @@ -784,7 +774,6 @@ void BatchedOrgqr(const GPUContext& dev_ctx, int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); @@ -829,20 +818,18 @@ void BatchedOrgqr(const GPUContext& dev_ctx, } template <> -void BatchedOrgqr>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + phi::complex64* a, + int lda, + phi::complex64* tau, + int a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCungqr_bufferSize( handle, @@ -856,16 +843,16 @@ void BatchedOrgqr>( DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex64* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex64* a_working_ptr = &a[i * a_stride]; + phi::complex64* tau_working_ptr = &tau[i * tau_stride]; // compute orggr PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCungqr( handle, @@ -896,20 +883,18 @@ void BatchedOrgqr>( } template <> -void BatchedOrgqr>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + phi::complex128* a, + int lda, + phi::complex128* tau, + int 
a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZungqr_bufferSize( handle, @@ -923,16 +908,16 @@ void BatchedOrgqr>( DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex128* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex128* a_working_ptr = &a[i * a_stride]; + phi::complex128* tau_working_ptr = &tau[i * tau_stride]; // compute orggr PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZungqr( handle, @@ -965,11 +950,15 @@ void BatchedOrgqr>( } // namespace phi +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(qr, GPU, ALL_LAYOUT, phi::QrKernel, float, double) {} +#else PD_REGISTER_PLUGIN_KERNEL(qr, metax_gpu, ALL_LAYOUT, phi::QrKernel, float, double, - phi::dtype::complex, - phi::dtype::complex) {} + phi::complex64, + phi::complex128) {} +#endif From 1e042162a9f7cbb4c08b260bae373122fee1e827 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Mon, 15 Sep 2025 10:30:01 +0800 Subject: [PATCH 07/58] [Metax] fix illegal address access error in test_momentum_op (#12) * [Metax] fix illegal address access error in test_momentum_op --- backends/metax_gpu/patch/tmp/mixed_vector.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/patch/tmp/mixed_vector.h b/backends/metax_gpu/patch/tmp/mixed_vector.h index e7cf1e626c9..1dcca9c71b4 100644 --- a/backends/metax_gpu/patch/tmp/mixed_vector.h +++ b/backends/metax_gpu/patch/tmp/mixed_vector.h @@ -386,7 +386,8 @@ class MixVector { // the unify method to access CPU or CUDA data. immutable. const T *Data(phi::Place place) const { - if (place.GetType() == phi::AllocationType::GPU) { + if (place.GetType() == phi::AllocationType::GPU || + place.GetType() == phi::AllocationType::CUSTOM) { return CUDAData(place); } else { return data(); @@ -395,7 +396,8 @@ class MixVector { // the unify method to access CPU or CUDA data. mutable. 
T *MutableData(phi::Place place) { - if (place.GetType() == phi::AllocationType::GPU) { + if (place.GetType() == phi::AllocationType::GPU || + place.GetType() == phi::AllocationType::CUSTOM) { return CUDAMutableData(place); } else { return data(); From aca80a41f6f619d995f5944c584c3141fab3ce9e Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 15 Sep 2025 11:41:10 +0800 Subject: [PATCH 08/58] [Metax] fix cufft and fix some blas kernel apply (#13) * [Metax] fix cufft and fix some blas kernel apply --- backends/metax_gpu/CMakeLists.txt | 13 ++---- backends/metax_gpu/patch/paddle.patch | 59 +++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 9 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index b22d7077e3b..6048b59e6c1 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -618,6 +618,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bernoulli_kernel.cu # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_grad_kernel_impl.h # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/box_coder_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -683,15 +684,9 @@ file( ${CMAKE_SOURCE_DIR}/kernels/flash_attn_kernel.cu ${CMAKE_SOURCE_DIR}/kernels/flashattn.cc) -list( - REMOVE_ITEM - CUDA_SRCS - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/gru_compute.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/multihead_matmul_functor.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/softmax.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/weight_only_gemv.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math/context_project.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/fft.cu) +list(REMOVE_ITEM CUDA_SRCS + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/softmax.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/weight_only_gemv.cu) file( GLOB diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 1935217baa0..8127caee61e 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -133,6 +133,26 @@ index c0080f0a5e..458ca3e2e8 100644 } // namespace dynload } // namespace phi +diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h +index 1547909d92..66b2779392 100644 +--- a/paddle/phi/backends/dynload/cufft.h ++++ b/paddle/phi/backends/dynload/cufft.h +@@ -1,3 +1,4 @@ ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. + /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); +@@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); + cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ + }); \ + EnforceCUFFTLoaded(#__name); \ +- static void* p_##__name = dlsym(cufft_dso_handle, #__name); \ ++ std::string replaced_name = #__name; \ ++ replaced_name = replaced_name.replace(0,2,"mc"); \ ++ static void* p_##__name = dlsym(cufft_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h index 59e92955c9..d2f8c2da15 100644 --- a/paddle/phi/backends/dynload/cupti.h @@ -437,6 +457,32 @@ index cb35feee32..64f5bd24ac 100644 #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" +diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu +index 88663ec880..98b93072a3 100644 +--- a/paddle/phi/kernels/funcs/gru_compute.cu ++++ b/paddle/phi/kernels/funcs/gru_compute.cu +@@ -12,7 +12,7 @@ limitations under the License. */ + #include "paddle/phi/kernels/funcs/gru_compute.h" + + #include "paddle/phi/backends/gpu/gpu_context.h" +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" + #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" + +diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h +index 15e1a4a3c3..e4780538d7 100644 +--- a/paddle/phi/kernels/funcs/math/context_project.h ++++ b/paddle/phi/kernels/funcs/math/context_project.h +@@ -18,7 +18,7 @@ + #include + + #include "paddle/phi/core/tensor_utils.h" +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/im2col.h" + + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -469,6 +515,19 @@ index 558d363b39..05da04b517 100644 #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" +diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +index 8b0baf5f5f..260482f124 100644 +--- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu ++++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +@@ -27,7 +27,7 @@ namespace cub = hipcub; + + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" + +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/math_cuda_utils.h" + + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..3c74792690 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h From fb547db298546f2c3249e22886c2232ba4882987 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 15 Sep 2025 16:04:35 +0800 Subject: [PATCH 09/58] [metax] add warpctc_warprnn (#14) * [metax] fix bug --- backends/metax_gpu/CMakeLists.txt | 2 + backends/metax_gpu/change_patch.sh | 1 + backends/metax_gpu/cmake/warpctc.cmake | 149 ++++++ backends/metax_gpu/cmake/warprnnt.cmake | 142 ++++++ .../warpctc_grad_kernel_register.cu | 2 +- 
.../cuda_kernels/warpctc_kernel_register.cu | 2 +- .../kernels/impl/warpctc_kernel_impl.h | 3 +- .../kernels/impl/warprnnt_kernel_impl.h | 6 +- backends/metax_gpu/patch/intrinsics.cuh | 459 ++++++++++++++++++ backends/metax_gpu/patch/paddle.patch | 26 + 10 files changed, 787 insertions(+), 5 deletions(-) create mode 100644 backends/metax_gpu/cmake/warpctc.cmake create mode 100644 backends/metax_gpu/cmake/warprnnt.cmake create mode 100644 backends/metax_gpu/patch/intrinsics.cuh diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 6048b59e6c1..cca23ab42f5 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -37,6 +37,8 @@ include(cblas) include(flashattn) include(cutlass) include(dgc) +include(warpctc) +include(warprnnt) set(PLUGIN_VERSION ${PADDLE_VERSION}) diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index 833ae00f6bd..60d74ec0f3d 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -25,3 +25,4 @@ cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core cd ../../Paddle/ git apply --verbose ../backends/metax_gpu/patch/paddle.patch cd - +cp -r patch/intrinsics.cuh ../../Paddle/third_party/warpctc/include/contrib/moderngpu/include/device/ diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake new file mode 100644 index 00000000000..71c892a6cfa --- /dev/null +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -0,0 +1,149 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +include(ExternalProject) + +if(WITH_ROCM) + add_definitions(-DWARPCTC_WITH_HIP) +endif() + +set(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) +set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) +# in case of low internet speed set(WARPCTC_REPOSITORY +# https://gitee.com/tianjianhe/warp-ctc.git) +set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184) +set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/warpctc) +set(WARPCTC_PATCH_COMMAND "") +set(WARPCTC_CCBIN_OPTION "") +if(WIN32) + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPCTC_TAG} && git apply + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch) +else() + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPCTC_TAG} && patch -Nd + ${SOURCE_DIR} < + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch) +endif() + +if(NOT WIN32 AND WITH_GPU) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0 AND ${CMAKE_CXX_COMPILER_VERSION} + VERSION_GREATER 12.0) + file(TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.patch native_src) + set(WARPCTC_PATCH_COMMAND git checkout -- . 
&& git checkout ${WARPCTC_TAG} + && patch -Nd ${SOURCE_DIR} < ${native_src} &&) + set(WARPCTC_CCBIN_OPTION -DCCBIN_COMPILER=${CCBIN_COMPILER}) + endif() +endif() + +if(WITH_ROCM) + set(WARPCTC_PATHCH_ROCM_COMMAND + patch -p1 < + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.rocm.patch && patch + -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/devicetypes.cuh.patch && patch + -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/hip.cmake.patch) +endif() + +set(WARPCTC_INCLUDE_DIR + "${WARPCTC_INSTALL_DIR}/include" + CACHE PATH "Warp-ctc Directory" FORCE) +# Used in unit test test_WarpCTCLayer +set(WARPCTC_LIB_DIR + "${WARPCTC_INSTALL_DIR}/lib" + CACHE PATH "Warp-ctc Library Directory" FORCE) + +if(WIN32) + set(WARPCTC_LIBRARIES + "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +else() + set(WARPCTC_LIBRARIES + "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +endif() + +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" + OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" + OR WIN32) + set(USE_OMP OFF) +else() + set(USE_OMP ON) +endif() + +if(WIN32) + set(WARPCTC_C_FLAGS $) + set(WARPCTC_C_FLAGS_DEBUG $) + set(WARPCTC_C_FLAGS_RELEASE + $) + set(WARPCTC_CXX_FLAGS $) + set(WARPCTC_CXX_FLAGS_RELEASE + $) + set(WARPCTC_CXX_FLAGS_DEBUG + $) +else() + set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) +endif() + +ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + SOURCE_DIR ${SOURCE_DIR} + PREFIX ${WARPCTC_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND + COMMAND ${WARPCTC_PATCH_COMMAND} + COMMAND ${WARPCTC_PATCH_CUDA_COMMAND} + COMMAND ${WARPCTC_PATHCH_ROCM_COMMAND} + # BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR} + ${EXTERNAL_OPTIONAL_ARGS} + ${WARPCTC_CCBIN_OPTION} + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES}) + +message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") +get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) +include_directories(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its + # headers. 
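+
+# Consumer sketch (illustrative; `my_kernels` is a hypothetical target, not
+# part of this build): `warpctc` below is an INTERFACE library carrying no
+# sources, so depending on it only orders a target after extern_warpctc has
+# built; the shared library itself is linked by its path.
+#
+#   add_dependencies(my_kernels warpctc)
+#   target_link_libraries(my_kernels ${WARPCTC_LIBRARIES})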
+ +add_library(warpctc INTERFACE) +add_dependencies(warpctc extern_warpctc) diff --git a/backends/metax_gpu/cmake/warprnnt.cmake b/backends/metax_gpu/cmake/warprnnt.cmake new file mode 100644 index 00000000000..54a7ad6be86 --- /dev/null +++ b/backends/metax_gpu/cmake/warprnnt.cmake @@ -0,0 +1,142 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +include(ExternalProject) + +if(WITH_ROCM) + add_definitions(-DWARPRNNT_WITH_HIP) +endif() + +set(WARPRNNT_PREFIX_DIR ${THIRD_PARTY_PATH}/warprnnt) +set(WARPRNNT_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warprnnt) +set(WARPRNNT_TAG 7ea6bfe748779c245a0fcaa5dd9383826273eff2) +set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/warprnnt) +set(WARPRNNT_PATCH_COMMAND "") +set(WARPRNNT_CCBIN_OPTION "") +if(WIN32) + set(WARPCTC_PATCH_CUDA_COMMAND + ${CMAKE_COMMAND} -E copy_if_different + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.cuda.patch + "/") +else() + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPRNNT_TAG} && patch -Nd + ${SOURCE_DIR} < + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.cuda.patch) +endif() +if(WITH_ROCM) + set(WARPRNNT_PATCH_ROCM_COMMAND + patch -p1 < + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.rocm.patch) +endif() +if(NOT WIN32 AND WITH_GPU) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0 AND ${CMAKE_CXX_COMPILER_VERSION} + VERSION_GREATER 12.0) + file(TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.patch native_src) + set(WARPRNNT_PATCH_COMMAND + git checkout -- . 
&& git checkout ${WARPRNNT_TAG} && patch -Nd + ${SOURCE_DIR} < ${native_src}) + set(WARPRNNT_CCBIN_OPTION -DCCBIN_COMPILER=${CCBIN_COMPILER}) + endif() +endif() + +set(WARPRNNT_INCLUDE_DIR + "${WARPRNNT_INSTALL_DIR}/include" + CACHE PATH "Warp-rnnt Directory" FORCE) +# Used in unit test test_WarpCTCLayer +set(WARPRNNT_LIB_DIR + "${WARPRNNT_INSTALL_DIR}/lib" + CACHE PATH "Warp-rnnt Library Directory" FORCE) + +if(WIN32) + set(WARPRNNT_LIBRARIES + "${WARPRNNT_INSTALL_DIR}/bin/warprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-rnnt Library" FORCE) +else() + set(WARPRNNT_LIBRARIES + "${WARPRNNT_INSTALL_DIR}/lib/libwarprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-rnnt Library" FORCE) +endif() + +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" + OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" + OR WIN32) + set(USE_OMP OFF) +else() + set(USE_OMP ON) +endif() + +if(WIN32) + set(WARPRNNT_C_FLAGS $) + set(WARPRNNT_C_FLAGS_DEBUG + $) + set(WARPRNNT_C_FLAGS_RELEASE + $) + set(WARPRNNT_CXX_FLAGS $) + set(WARPRNNT_CXX_FLAGS_RELEASE + $) + set(WARPRNNT_CXX_FLAGS_DEBUG + $) +else() + set(WARPRNNT_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPRNNT_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPRNNT_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPRNNT_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPRNNT_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPRNNT_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) +endif() +ExternalProject_Add( + extern_warprnnt + ${EXTERNAL_PROJECT_LOG_ARGS} + SOURCE_DIR ${SOURCE_DIR} + PREFIX ${WARPRNNT_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND + COMMAND ${WARPCTC_PATCH_CUDA_COMMAND} + COMMAND ${WARPRNNT_PATCH_ROCM_COMMAND} + # BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${WARPRNNT_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPRNNT_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPRNNT_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPRNNT_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPRNNT_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPRNNT_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPRNNT_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + ${WARPCTC_CCBIN_OPTION} + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPRNNT_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPRNNT_LIBRARIES}) + +message(STATUS "warp-rnnt library: ${WARPRNNT_LIBRARIES}") +get_filename_component(WARPRNNT_LIBRARY_PATH ${WARPRNNT_LIBRARIES} DIRECTORY) +include_directories(${WARPRNNT_INCLUDE_DIR}) # For warprnnt code to include its + # headers. 
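+
+# Note (stated as an assumption about generator behavior): BUILD_BYPRODUCTS
+# above declares ${WARPRNNT_LIBRARIES} as an output of extern_warprnnt, so
+# generators that track file-level dependencies (e.g. Ninja) know the library
+# only exists after the external build step has run.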
+ +add_library(warprnnt INTERFACE) +# set_property(TARGET warprnnt PROPERTY IMPORTED_LOCATION ${WARPRNNT_LIBRARIES}) +add_dependencies(warprnnt extern_warprnnt) diff --git a/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu index e77a29d12fe..d02f805a671 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu @@ -17,7 +17,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/warpctc_grad_kernel.h" -PD_REGISTER_PLUGIN_KERNEL(warpctc_grad, +PD_CUSTOM_KERNEL_REGISTER(warpctc_grad, metax_gpu, ALL_LAYOUT, phi::WarpctcGradKernel, diff --git a/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu index 5b343506cad..c488e23fba9 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu @@ -17,5 +17,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/warpctc_kernel.h" -PD_REGISTER_PLUGIN_KERNEL( +PD_CUSTOM_KERNEL_REGISTER( warpctc, metax_gpu, ALL_LAYOUT, phi::WarpctcKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h index eb64f21c90f..9794ba1b3c0 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h @@ -204,7 +204,8 @@ class WarpCTCFunctor { void init(const Context& dev_ctx, const size_t blank) { warpctc_version_ = phi::dynload::get_warpctc_version(); - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = CTC_GPU; options_.stream = diff --git a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h index 96e756b16b1..bb4311f5912 100644 --- a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h @@ -138,7 +138,8 @@ class WarpRNNTFunctor { // There is no memory allocated operations within warp-rnnt. rnntStatus_t status = RNNT_STATUS_UNKNOWN_ERROR; bool gpu = false; - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpu = true; #else @@ -207,7 +208,8 @@ class WarpRNNTFunctor { options_.fastemit_lambda = fastemit_lambda; options_.batch_first = true; - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = RNNT_GPU; options_.stream = diff --git a/backends/metax_gpu/patch/intrinsics.cuh b/backends/metax_gpu/patch/intrinsics.cuh new file mode 100644 index 00000000000..71365b6577c --- /dev/null +++ b/backends/metax_gpu/patch/intrinsics.cuh @@ -0,0 +1,459 @@ +/****************************************************************************** + * Copyright (c) 2013, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * + * Code and text by Sean Baxter, NVIDIA Research + * See http://nvlabs.github.io/moderngpu for repository and documentation. 
+ *
+ ******************************************************************************/
+
+#include "devicetypes.cuh"
+
+#pragma once
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+
+namespace mgpu {
+
+MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
+  return *reinterpret_cast<uint2*>(&x);
+}
+MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
+  return *reinterpret_cast<uint64*>(&x);
+}
+
+MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
+  return *reinterpret_cast<int2*>(&x);
+}
+MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
+  return *reinterpret_cast<int64*>(&x);
+}
+
+MGPU_HOST_DEVICE int2 double_as_int2(double x) {
+  return *reinterpret_cast<int2*>(&x);
+}
+MGPU_HOST_DEVICE double int2_as_double(int2 x) {
+  return *reinterpret_cast<double*>(&x);
+}
+
+MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
+  reinterpret_cast<int*>(&d)[0] = x;
+}
+MGPU_HOST_DEVICE int GetDoubleX(double d) {
+  return double_as_int2(d).x;
+}
+MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
+  reinterpret_cast<int*>(&d)[1] = y;
+}
+MGPU_HOST_DEVICE int GetDoubleY(double d) {
+  return double_as_int2(d).y;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// PTX for bfe and bfi
+
+#if __CUDA_ARCH__ >= 200
+
+MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
+  uint result;
+  asm("bfe.u32 %0, %1, %2, %3;" :
+      "=r"(result) : "r"(x), "r"(bit), "r"(numBits));
+  return result;
+}
+
+
+MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
+  uint result;
+  asm("bfi.b32 %0, %1, %2, %3, %4;" :
+      "=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
+  return result;
+}
+
+MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
+  uint ret;
+  asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
+  return ret;
+}
+
+#endif // __CUDA_ARCH__ >= 200
+
+
+////////////////////////////////////////////////////////////////////////////////
+// shfl_up
+
+__device__ __forceinline__ float shfl_up(float var,
+  unsigned int delta, int width = 32) {
+
+#if __CUDA_ARCH__ >= 300
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
+#else
+  var = __shfl_up(var, delta, width);
+#endif
+#endif
+  return var;
+}
+
+__device__ __forceinline__ double shfl_up(double var,
+  unsigned int delta, int width = 32) {
+
+#if __CUDA_ARCH__ >= 300
+  int2 p = mgpu::double_as_int2(var);
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
+  p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
+#else
+  p.x = __shfl_up(p.x, delta, width);
+  p.y = __shfl_up(p.y, delta, width);
+#endif
+  var = mgpu::int2_as_double(p);
+#endif
+
+  return var;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// shfl_add
+
+// MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
+//   int result = 0;
+// #if __CUDA_ARCH__ >= 300
+//   int mask = (WARP_SIZE - width)<< 8;
+// #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+//   asm(
+//       "{.reg .s32 r0;"
+//       ".reg .pred p;"
+//       "shfl.up.sync.b32 r0|p, %1, %2, %3, 0xFFFFFFFF;"
+//       "@p add.s32 r0, r0, %4;"
+//       "mov.s32 %0, r0; }"
+//       : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
+// #else
+//   asm(
+//       "{.reg .s32 r0;"
+//       ".reg .pred p;"
+//       "shfl.up.b32 r0|p, %1, %2, %3;"
+//       "@p add.s32 r0, r0, %4;"
+//       "mov.s32 %0, r0; }"
+//       : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
+// #endif
+// #endif
+//   return result;
+// }
+
+MGPU_DEVICE int
shfl_add(int x, int offset, int width = 32) +{ +#if __CUDA_ARCH__ >= 300 + unsigned fullMask = 0xffffffffU; + unsigned mask = (width == 32) ? fullMask : ((1U << width) - 1U); + int src = 0; +#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 9 + src = __shfl_up_sync(mask, x, offset, width); // CUDA 9+ +#else + src = __shfl_up(x, offset, width); // CUDA 8- +#endif + int lane = threadIdx.x & 31; + return (lane >= offset) ? (src + x) : x; +#else + return x; +#endif +} + +MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) { + int result = 0; +#if __CUDA_ARCH__ >= 300 + int mask = (WARP_SIZE - width)<< 8; +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.sync.b32 r0|p, %1, %2, %3, 0xFFFFFFFF;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#else + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.b32 r0|p, %1, %2, %3;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#endif +#endif + return result; +} + +//////////////////////////////////////////////////////////////////////////////// +// brev, popc, clz, bfe, bfi, prmt + +// Reverse the bits in an integer. +MGPU_HOST_DEVICE uint brev(uint x) { +#if __CUDA_ARCH__ >= 200 + uint y = __brev(x); +#else + uint y = 0; + for(int i = 0; i < 32; ++i) + y |= (1 & (x>> i))<< (31 - i); +#endif + return y; +} + +// Count number of bits in a register. +MGPU_HOST_DEVICE int popc(uint x) { +#if __CUDA_ARCH__ >= 200 + return __popc(x); +#else + int c; + for(c = 0; x; ++c) + x &= x - 1; + return c; +#endif +} + +// Count leading zeros - start from most significant bit. +MGPU_HOST_DEVICE int clz(int x) { +#if __CUDA_ARCH__ >= 200 + return __clz(x); +#else + for(int i = 31; i >= 0; --i) + if((1<< i) & x) return 31 - i; + return 32; +#endif +} + +// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0. +MGPU_HOST_DEVICE int ffs(int x) { +#if __CUDA_ARCH__ >= 200 + return __ffs(x); +#else + for(int i = 0; i < 32; ++i) + if((1<< i) & x) return i + 1; + return 0; +#endif +} + +MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) { +#if __CUDA_ARCH__ >= 200 + return bfe_ptx(x, bit, numBits); +#else + return ((1<< numBits) - 1) & (x>> bit); +#endif +} + +MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = bfi_ptx(x, y, bit, numBits); +#else + if(bit + numBits > 32) numBits = 32 - bit; + uint mask = ((1<< numBits) - 1)<< bit; + result = y & ~mask; + result |= mask & (x<< bit); +#endif + return result; +} + +MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = prmt_ptx(a, b, index); +#else + result = 0; + for(int i = 0; i < 4; ++i) { + uint sel = 0xf & (index>> (4 * i)); + uint x = ((7 & sel) > 3) ? b : a; + x = 0xff & (x>> (8 * (3 & sel))); + if(8 & sel) x = (128 & x) ? 0xff : 0; + result |= x<< (8 * i); + } +#endif + return result; +} + +// Find log2(x) and optionally round up to the next integer logarithm. +MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) { + int a = 31 - clz(x); + if(roundUp) a += !MGPU_IS_POW_2(x); + return a; +} + +//////////////////////////////////////////////////////////////////////////////// +// vset4 + +#if __CUDA_ARCH__ >= 300 + +// Performs four byte-wise comparisons and returns 1 for each byte that +// satisfies the conditional, and zero otherwise. 
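+// Illustrative example (not in the original moderngpu source): with c = 0,
+// vset4_lt_add(0x01020304, 0x02020404, 0) returns 0x01000100 -- only the most
+// significant byte (0x01 < 0x02) and the second-lowest byte (0x03 < 0x04)
+// pass the less-than test, and each passing byte contributes 1 in its lane.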
+MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
+  uint result;
+  asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
+      "=r"(result) : "r"(a), "r"(b), "r"(c));
+  return result;
+}
+MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
+  uint result;
+  asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
+      "=r"(result) : "r"(a), "r"(b), "r"(0));
+  return result;
+}
+#endif // __CUDA_ARCH__ >= 300
+
+MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
+  uint result;
+#if __CUDA_ARCH__ >= 300
+  result = vset4_lt_add_ptx(a, b, c);
+#else
+  result = c;
+  if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
+  if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
+  if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
+  if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
+#endif
+  return result;
+}
+
+MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
+  uint result;
+#if __CUDA_ARCH__ >= 300
+  result = vset4_eq_ptx(a, b);
+#else
+  result = 0;
+  if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
+  if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
+  if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
+  if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
+#endif
+  return result;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+
+MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
+#if __CUDA_ARCH__ >= 100
+  return __umulhi(x, y);
+#else
+  uint64 product = (uint64)x * y;
+  return (uint)(product>> 32);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// ldg() function defined for all devices and all types. Only compiles to __ldg
+// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
+// by __ldg in sm_32_intrinsics.h
+
+template<typename T>
+struct IsLdgType {
+  enum { value = false };
+};
+#define DEFINE_LDG_TYPE(T) \
+  template<> struct IsLdgType<T> { enum { value = true }; };
+
+template<typename T, bool UseLDG = IsLdgType<T>::value>
+struct LdgShim {
+  MGPU_DEVICE static T Ldg(const T* p) {
+    return *p;
+  }
+};
+
+#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
+
+  // List of __ldg-compatible types from sm_32_intrinsics.h.
+  DEFINE_LDG_TYPE(char)
+  DEFINE_LDG_TYPE(short)
+  DEFINE_LDG_TYPE(int)
+  DEFINE_LDG_TYPE(long long)
+  DEFINE_LDG_TYPE(char2)
+  DEFINE_LDG_TYPE(char4)
+  DEFINE_LDG_TYPE(short2)
+  DEFINE_LDG_TYPE(short4)
+  DEFINE_LDG_TYPE(int2)
+  DEFINE_LDG_TYPE(int4)
+  DEFINE_LDG_TYPE(longlong2)
+
+  DEFINE_LDG_TYPE(unsigned char)
+  DEFINE_LDG_TYPE(unsigned short)
+  DEFINE_LDG_TYPE(unsigned int)
+  DEFINE_LDG_TYPE(unsigned long long)
+  DEFINE_LDG_TYPE(uchar2)
+  DEFINE_LDG_TYPE(uchar4)
+  DEFINE_LDG_TYPE(ushort2)
+  DEFINE_LDG_TYPE(ushort4)
+  DEFINE_LDG_TYPE(uint2)
+  DEFINE_LDG_TYPE(uint4)
+  DEFINE_LDG_TYPE(ulonglong2)
+
+  DEFINE_LDG_TYPE(float)
+  DEFINE_LDG_TYPE(double)
+  DEFINE_LDG_TYPE(float2)
+  DEFINE_LDG_TYPE(float4)
+  DEFINE_LDG_TYPE(double2)
+
+  template<typename T> struct LdgShim<T, true> {
+    MGPU_DEVICE static T Ldg(const T* p) {
+      return __ldg(p);
+    }
+  };
+#endif
+
+template<typename T>
+MGPU_DEVICE T ldg(const T* p) {
+  return LdgShim<T>::Ldg(p);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Fast division for 31-bit integers.
+// Uses the method in Hacker's Delight (2nd edition) page 228.
+// Evaluates for denom > 1 and x < 2^31.
+struct FastDivide { + uint denom; + uint coef; + uint shift; + + MGPU_HOST_DEVICE uint Divide(uint x) { + return umulhi(x, coef)>> shift; + } + MGPU_HOST_DEVICE uint Modulus(uint x) { + return x - Divide(x) * denom; + } + + explicit FastDivide(uint denom_) { + denom = denom_; + uint p = 31 + FindLog2(denom, true); + coef = (uint)(((1ull<< p) + denom - 1) / denom); + shift = p - 32; + } +}; + +#pragma GCC diagnostic pop + +} // namespace mgpu diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 8127caee61e..0283a443adb 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -1087,6 +1087,32 @@ index 6f03f76eeb..5fe2c3e7dc 100644 #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" +diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h +index 7b85903776..3f4b298807 100644 +--- a/paddle/phi/kernels/impl/merged_momentum_impl.h ++++ b/paddle/phi/kernels/impl/merged_momentum_impl.h +@@ -297,7 +297,7 @@ void MergedMomentumInnerCompute( + params_out[idx], + velocities_out[idx]); + VLOG(10) << "Launch MergedMomentum cpu kernel."; +- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + phi::funcs::ForRange for_range( + static_cast(dev_ctx), params[idx]->numel()); + const auto grad_type = grads[idx]->dtype(); +diff --git a/paddle/phi/kernels/impl/momentum_kernel_impl.h b/paddle/phi/kernels/impl/momentum_kernel_impl.h +index de5bcfc30b..eb2a9714f5 100644 +--- a/paddle/phi/kernels/impl/momentum_kernel_impl.h ++++ b/paddle/phi/kernels/impl/momentum_kernel_impl.h +@@ -457,7 +457,7 @@ void MomentumDenseImpl(const Context& dev_ctx, + regularization_coeff, + param_out, + velocity_out); +- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + funcs::ForRange for_range(dev_ctx, param.numel()); + const auto grad_type = grad.dtype(); + #define PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(__nesterov, __reg_type) \ diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h From 8e981985c3b9f2e6bfc3789d92b48fed42abace1 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Mon, 15 Sep 2025 17:40:04 +0800 Subject: [PATCH 10/58] [Metax] update metax CI (#15) * [Metax] update metax CI --- backends/metax_gpu/tests/CMakeLists.txt | 100 ++++- .../check_diff_metax_legacy_unit_test.sh | 108 +++++ .../tests/unit_test/test_abs_metax.py | 39 ++ .../tests/unit_test/test_arange_metax.py | 260 ++++++++++++ .../test_bfloat16_embedding_metax.py | 72 ++++ .../unit_test/test_count_nonzero_api_metax.py | 81 ++++ .../unit_test/test_gaussian_nll_loss_metax.py | 208 +++++++++ .../tests/unit_test/test_greater_equal.py | 44 ++ ...bate_build_src_rank_and_local_expert_id.py | 62 +++ ...test_incubate_expand_modality_expert_id.py | 172 ++++++++ .../test_incubate_fused_rmsnorm_ext_metax.py | 95 +++++ .../unit_test/test_incubate_moe_combine.py | 193 +++++++++ ...moe_gate_dispatch_partial_nosoftmaxtopk.py | 218 ++++++++++ ...st_incubate_moe_gate_dispatch_w_permute.py 
| 207 +++++++++ ...ncubate_moe_gate_dispatch_w_permute_bwd.py | 175 ++++++++ .../tests/unit_test/test_layer_norm.py | 358 ++++++++++++++++ .../tests/unit_test/test_matmul_op__metax.py | 395 ++++++++++++++++++ .../tests/unit_test/test_nonzero_api_metax.py | 220 ++++++++++ .../tests/unit_test/test_p_norm_op_metax.py | 215 ++++++++++ .../tests/unit_test/test_squeeze_op_metax.py | 125 ++++++ .../tests/unit_test/test_swiglu_metax.py | 295 +++++++++++++ .../tests/unit_test/test_top_p_sampling.py | 162 +++++++ .../unit_test/test_unsqueeze_op_metax.py | 98 +++++ 23 files changed, 3894 insertions(+), 8 deletions(-) create mode 100644 backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh create mode 100644 backends/metax_gpu/tests/unit_test/test_abs_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_arange_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_greater_equal.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py create mode 100644 backends/metax_gpu/tests/unit_test/test_layer_norm.py create mode 100644 backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_swiglu_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_top_p_sampling.py create mode 100644 backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index d2e92f209ab..7e549ef4eaa 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -5,22 +5,106 @@ enable_testing() find_package(Python REQUIRED COMPONENTS Interpreter) -file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "unittest/*.py") +set(PADDLE_LEGACY_TEST_PATH + ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test) +set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) + +file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") list( APPEND PYTHON_TEST_SCRIPTS - ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test/test_tril_triu_op.py -) + ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_where_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_split_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_fill_constant_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_empty_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_sign_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_unbind_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_put_along_axis_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_maximum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_strided_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_set_value_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_flatten_contiguous_range_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_top_k_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_subtract_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_greater_equal_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_top_k_v2_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_one_hot_v2_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_fill_any_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_reshape_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_bitwise_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_pad_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_zeros_like_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_shape_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_bincount_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_assign_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_fused_bias_act_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_adamw_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_nd_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_concat_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scatter_nd_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_floordiv_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_mul_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_numel_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scale_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_reduce_op.py) list( REMOVE_ITEM PYTHON_TEST_SCRIPTS - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_cumsum_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_softmax_with_cross_entropy_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_expand_v2_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_tril_triu_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_squared_l2_norm_op_metax.py) + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py)

 list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS)

 foreach(test_script ${PYTHON_TEST_SCRIPTS})
diff --git a/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh b/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh
new file mode 100644
index 00000000000..86bfcb08f86
--- /dev/null
+++ b/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh
@@ -0,0 +1,108 @@
+#!/bin/bash

+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+SOURCE_DIR="backends/metax_gpu/tests/unittest"
+SEARCH_DIR="Paddle/test/legacy_test"
+PREFIX_FILE="metax_prefixes.txt"
+UNMATCHED_FILE="unmatched_files.txt"
+EXIST_FILE="existing_files.txt"
+MISS_FILE="missing_files.txt"
+
+# Check that the source path exists
+if [ ! -d "$SOURCE_DIR" ]; then
+    echo "Error: source path '$SOURCE_DIR' does not exist or is not a directory"
+    exit 1
+fi
+
+# Check that the search path exists
+if [ ! -d "$SEARCH_DIR" ]; then
+    echo "Error: search path '$SEARCH_DIR' does not exist or is not a directory"
+    exit 1
+fi
+
+# Step 1: extract prefixes (using the _op/_metax rules)
+echo "Step 1: extracting file prefixes from '$SOURCE_DIR' (by the _op/_metax rules)..."
+> "$PREFIX_FILE" # clear the prefix file
+> "$UNMATCHED_FILE" # clear the unmatched-file list
+
+find "$SOURCE_DIR" -type f -name "*.py" | while read -r file; do
+    filename=$(basename "$file")
+    prefix=""
+
+    # Rule 1: the name contains _op -- take everything before _op
+    if [[ "$filename" == *"_op"* ]]; then
+        prefix="${filename%%_op*}"
+        echo "Extracted prefix (_op rule): $prefix (from $filename)"
+        echo "$prefix" >> "$PREFIX_FILE"
+
+    # Rule 2: no _op but the name contains _metax -- take everything before _metax
+    elif [[ "$filename" == *"_metax"* ]]; then
+        prefix="${filename%%_metax*}"
+        echo "Extracted prefix (_metax rule): $prefix (from $filename)"
+        echo "$prefix" >> "$PREFIX_FILE"
+
+    # Rule 3: neither marker -- classify as unmatched
+    else
+        echo "Unmatched file: $filename (contains neither _op nor _metax)"
+        echo "$filename" >> "$UNMATCHED_FILE"
+    fi
+done
+
+# Check whether any prefixes or unmatched files were produced
+prefix_count=$(wc -l < "$PREFIX_FILE")
+unmatched_count=$(wc -l < "$UNMATCHED_FILE")
+
+echo "Extraction finished - valid prefixes: $prefix_count, unmatched files: $unmatched_count"
+
+if [ $prefix_count -eq 0 ] && [ $unmatched_count -eq 0 ]; then
+    echo "Warning: no files ending in '_metax.py' were found in '$SOURCE_DIR'"
+    exit 0
+fi
+
+# Step 2: look for same-named files in the search path (top level only, no subdirectories)
+echo -e "\nStep 2: searching '$SEARCH_DIR' for same-named files (depth 1)..."
+> "$EXIST_FILE" # clear the found-file list
+> "$MISS_FILE" # clear the missing-file list
+
+# Process each prefix in turn
+while read -r prefix; do
+    # Skip empty lines
+    if [ -z "$prefix" ]; then
+        continue
+    fi
+
+    # Search only the immediate directory of the search path (depth 1)
+    found=$(find "$SEARCH_DIR" -maxdepth 1 -type f -name "${prefix}_op.py" -print -quit)
+
+    if [ -n "$found" ]; then
+        echo "$prefix -> found file: $found"
+        echo "${prefix}_op.py" >> "$EXIST_FILE"
+    else
+        echo "$prefix -> no same-named file found"
+        echo "$prefix" >> "$MISS_FILE"
+    fi
+done < "$PREFIX_FILE"
+
+# Print result statistics
+exist_count=$(wc -l < "$EXIST_FILE")
+miss_count=$(wc -l < "$MISS_FILE")
+
+echo -e "\nDone!"
+echo "Prefixes with a same-named file: $exist_count (saved to $EXIST_FILE)"
+echo "Prefixes without a same-named file: $miss_count (saved to $MISS_FILE)"
+echo "Files matching no rule: $unmatched_count (saved to $UNMATCHED_FILE)"
diff --git a/backends/metax_gpu/tests/unit_test/test_abs_metax.py b/backends/metax_gpu/tests/unit_test/test_abs_metax.py
new file mode 100644
index 00000000000..0dae6822bba
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_abs_metax.py
@@ -0,0 +1,39 @@
+# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
+# # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.base.dygraph as dg
+
+
+class TestAbs(unittest.TestCase):
+    def setUp(self):
+        self._dtypes = ["float32"]
+        self._places = [paddle.CustomPlace("metax_gpu", 0)]
+
+    def test_all_positive(self):
+        for dtype in self._dtypes:
+            x = 1 + 10 * np.random.random([13, 3, 3]).astype(dtype)
+            for place in self._places:
+                with dg.guard(place):
+                    y = paddle.abs(paddle.to_tensor(x))
+                    np.testing.assert_allclose(np.abs(x), y.numpy(), rtol=1e-05)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_arange_metax.py b/backends/metax_gpu/tests/unit_test/test_arange_metax.py
new file mode 100644
index 00000000000..89308c33401
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_arange_metax.py
@@ -0,0 +1,260 @@
+# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
+# # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
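+#
+# Illustrative note (not part of the original test): like numpy.arange,
+# paddle.arange(start, end, step) yields ceil((end - start) / step) elements,
+# so the TestArangeStatic case further below, paddle.arange(0, 1 + 0.005,
+# 0.005), checks for ceil(1.005 / 0.005) = 201 entries.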
+ +import unittest + +import numpy as np +from op_test import OpTest, convert_float_to_uint16 + +import paddle +from paddle.base import core +from paddle.static import Program, program_guard + + +def arange_wrapper(start, end, step, dtype="float32"): + return paddle.arange(start, end, step, dtype) + + +class TestArangeOp(OpTest): + def setUp(self): + self.op_type = "range" + self.init_config() + self.inputs = { + "Start": np.array([self.case[0]]).astype(self.dtype), + "End": np.array([self.case[1]]).astype(self.dtype), + "Step": np.array([self.case[2]]).astype(self.dtype), + } + + self.outputs = { + "Out": np.arange(self.case[0], self.case[1], self.case[2]).astype( + self.dtype + ) + } + + def init_config(self): + self.dtype = np.float32 + self.python_api = arange_wrapper + self.case = (0, 1, 0.2) + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + +class TestFloatArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float32 + self.python_api = paddle.arange + self.case = (0, 5, 1) + + +class TestFloat16ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float16 + self.python_api = paddle.arange + self.case = (0, 5, 1) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestBFloat16ArangeOp(OpTest): + def setUp(self): + self.op_type = "range" + self.init_config() + self.inputs = { + "Start": convert_float_to_uint16(self.start), + "End": convert_float_to_uint16(self.end), + "Step": convert_float_to_uint16(self.step), + } + + self.outputs = { + "Out": convert_float_to_uint16(np.arange(self.start, self.end, self.step)) + } + + def init_config(self): + self.dtype = np.uint16 + self.python_api = arange_wrapper + self.case = (0, 5, 1) + self.start = np.array([self.case[0]]).astype(np.float32) + self.end = np.array([self.case[1]]).astype(np.float32) + self.step = np.array([self.case[2]]).astype(np.float32) + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, check_pir=True, check_symbol_infer=False) + + +class TestInt32ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int32 + self.python_api = paddle.arange + self.case = (0, 5, 2) + + +class TestFloat64ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float64 + self.python_api = paddle.arange + self.case = (10, 1, -2) + + +class TestInt64ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int64 + self.python_api = paddle.arange + self.case = (-1, -10, -2) + + +class TestZeroSizeArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int32 + self.python_api = paddle.arange + self.case = (0, 0, 1) + + +class TestArangeOpError(unittest.TestCase): + def test_static_errors(self): + with program_guard(Program(), Program()): + paddle.enable_static() + self.assertRaises(TypeError, paddle.arange, 10, dtype="int8") + + +class TestArangeAPI(unittest.TestCase): + def test_out(self): + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x1 = paddle.arange(0, 5, 1, "float32") + + place = ( + paddle.CUDAPlace(0) + if core.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + out = exe.run(fetch_list=[x1]) + + expected_data = np.arange(0, 5, 1).astype(np.float32) + self.assertEqual((out == expected_data).all(), True) + 
self.assertListEqual(list(x1.shape), [5])
+        paddle.disable_static(place)
+
+
+class TestArangeImperative(unittest.TestCase):
+    def test_out(self):
+        place = (
+            paddle.CUDAPlace(0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
+        )
+        paddle.disable_static(place)
+        x1 = paddle.arange(0, 5, 1)
+        x2 = paddle.tensor.arange(5)
+        x3 = paddle.tensor.creation.arange(5)
+
+        start = paddle.to_tensor(np.array([0], "float32"))
+        end = paddle.to_tensor(np.array([5], "float32"))
+        step = paddle.to_tensor(np.array([1], "float32"))
+        x4 = paddle.arange(start, end, step, "int64")
+
+        expected_data = np.arange(0, 5, 1).astype(np.int64)
+        for x in [x1, x2, x3, x4]:
+            np.testing.assert_array_equal(x.numpy(), expected_data)
+
+        start_float = paddle.to_tensor(np.array([0.5], "float32"))
+        end_float = paddle.to_tensor(np.array([1.5], "float32"))
+        step_float = paddle.to_tensor(np.array([0.5], "float32"))
+        # all of [start, end, step] are float
+        x5 = paddle.arange(start_float, end_float, step_float)
+        x5_expected_data = np.arange(0.5, 1.5, 0.5).astype(np.float32)
+        np.testing.assert_array_equal(x5.numpy(), x5_expected_data)
+        self.assertEqual(x5.numpy().dtype, np.float32)
+
+        # [start, end] is float, [step] is int
+        x6 = paddle.arange(start_float, end_float, 1)
+        x6_expected_data = np.arange(0.5, 1.5, 1).astype(np.float32)
+        np.testing.assert_array_equal(x6.numpy(), x6_expected_data)
+        self.assertEqual(x6.numpy().dtype, np.float32)
+
+        # [start] is float, [end] is int
+        x7 = paddle.arange(start_float, 1)
+        x7_expected_data = np.arange(0.5, 1).astype(np.float32)
+        np.testing.assert_array_equal(x7.numpy(), x7_expected_data)
+        self.assertEqual(x7.numpy().dtype, np.float32)
+
+        # [start] is float
+        x8 = paddle.arange(start_float)
+        x8_expected_data = np.arange(0.5).astype(np.float32)
+        np.testing.assert_array_equal(x8.numpy(), x8_expected_data)
+        self.assertEqual(x8.numpy().dtype, np.float32)
+
+        # [start] is int
+        x9 = paddle.arange(1)
+        x9_expected_data = np.arange(1).astype(np.int64)
+        np.testing.assert_array_equal(x9.numpy(), x9_expected_data)
+        self.assertEqual(x9.numpy().dtype, np.int64)
+
+        # [start] is float
+        x10 = paddle.arange(1.0)
+        x10_expected_data = np.arange(1).astype(np.float32)
+        np.testing.assert_array_equal(x10.numpy(), x10_expected_data)
+        self.assertEqual(x10.numpy().dtype, np.float32)
+
+        # [start] is np.int
+        x11 = paddle.arange(np.int64(10))
+        x11_expected_data = np.arange(10).astype(np.int64)
+        np.testing.assert_array_equal(x11.numpy(), x11_expected_data)
+        self.assertEqual(x11.numpy().dtype, np.int64)
+
+        # [start] is a big integer
+        x12 = paddle.arange(
+            start=0,
+            end=-9007199254740994,
+            step=-9007199254740993,
+        )
+
+        # numpy gives the wrong result here, so 'x12_expected_data' is generated manually
+        # x12_expected_data = np.arange(start=0, stop=-9007199254740994, step=-9007199254740993, dtype=np.int64)
+        x12_expected_data = np.array([0, -9007199254740993])
+
+        np.testing.assert_array_equal(x12.numpy(), x12_expected_data)
+        self.assertEqual(x12.numpy().dtype, np.int64)
+
+        # [start > end, step > 0]
+        x14 = paddle.arange(start=10, end=0, step=1)
+
+        x14_expected_data = np.array([])
+        np.testing.assert_array_equal(x14.numpy(), x14_expected_data)
+
+        paddle.enable_static()
+
+
+class TestArangeStatic(unittest.TestCase):
+    def test_infermeta(self):
+        paddle.enable_static()
+        x = paddle.arange(0, 1 + 0.005, 0.005)
+        self.assertEqual(x.shape, [201])
+        paddle.disable_static()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git 
a/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py b/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py new file mode 100644 index 00000000000..f575d4eece0 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py @@ -0,0 +1,72 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F + + +class BF16EmbeddingTest(unittest.TestCase): + def setUp(self): + self.batch_size = 30 + self.vocab_size = 1024 + self.hidden_size = 512 + self.seed = 10 + + def run_main(self, dtype): + ids, weight, dout = self.gen_random() + origin_dtype = weight.dtype + weight_cast = weight.astype(dtype) + out = F.embedding(ids, weight_cast) + dout = dout.astype(out.dtype) + dweight = paddle.autograd.grad(out, weight, dout) + return ( + out.astype(origin_dtype).numpy(), + dweight[0].astype(origin_dtype).numpy(), + ) + + def gen_random(self): + np.random.seed(self.seed) + weight = np.random.random([self.vocab_size, self.hidden_size]).astype("float32") + ids = np.random.randint(low=0, high=self.vocab_size, size=[self.batch_size]) + dout = np.random.random([self.batch_size, self.hidden_size]).astype("float32") + + weight = paddle.to_tensor(weight) + weight.stop_gradient = False + ids = paddle.to_tensor(ids) + dout = paddle.to_tensor(dout) + return ids, weight, dout + + def test_main(self): + + ret1 = self.run_main("float32") + ret2 = self.run_main("bfloat16") + self.assertEqual(len(ret1), len(ret2)) + for i, (r1, r2) in enumerate(zip(ret1, ret2)): + np.testing.assert_allclose(r1, r2, atol=1e-3, rtol=1e-2) + + +class BF16EmbeddingTestOddHiddenSize(BF16EmbeddingTest): + def setUp(self): + self.batch_size = 30 + self.vocab_size = 511 + self.hidden_size = 512 + self.seed = 20 + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py b/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py new file mode 100644 index 00000000000..57a5d0b1c97 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
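+#
+# Illustrative note (not part of the original test): paddle.count_nonzero
+# mirrors np.count_nonzero -- it is equivalent to summing (x != 0) over the
+# given axis (all axes when axis is None), keeping reduced axes of size 1
+# when keepdim=True.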
+ +import unittest + +import numpy as np + +import paddle + +np.random.seed(10) + + +class TestCountNonzeroAPI(unittest.TestCase): + # test paddle.tensor.math.count_nonzero + + def setUp(self): + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32) + self.place = paddle.CustomPlace("metax_gpu", 0) + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data("X", self.x_shape) + out1 = paddle.count_nonzero(x) + out2 = paddle.tensor.count_nonzero(x) + out3 = paddle.tensor.math.count_nonzero(x) + axis = np.arange(len(self.x_shape)).tolist() + out4 = paddle.count_nonzero(x, axis) + out5 = paddle.count_nonzero(x, tuple(axis)) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={"X": self.x}, fetch_list=[out1, out2, out3, out4, out5]) + out_ref = np.count_nonzero(self.x) + for out in res: + np.testing.assert_allclose(out, out_ref, rtol=1e-05) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + + def test_case(x, axis=None, keepdim=False): + x_tensor = paddle.to_tensor(x) + out = paddle.count_nonzero(x_tensor, axis=axis, keepdim=keepdim) + if isinstance(axis, list): + axis = tuple(axis) + if len(axis) == 0: + axis = None + + out_ref = np.count_nonzero(x, axis, keepdims=keepdim) + np.testing.assert_allclose(out.numpy(), out_ref, rtol=1e-05) + + test_case(self.x) + test_case(self.x, None) + test_case(self.x, -1) + test_case(self.x, keepdim=True) + test_case(self.x, 2, keepdim=True) + test_case(self.x, [0, 2]) + test_case(self.x, (0, 2)) + test_case(self.x, (0, 1, 3)) + test_case(self.x, [0, 1, 2, 3]) + paddle.enable_static() + + def test_errors(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data("X", [10, 12], "int32") + self.assertRaises(ValueError, paddle.count_nonzero, x, axis=10) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py b/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py new file mode 100644 index 00000000000..73e389324f9 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py @@ -0,0 +1,208 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
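+#
+# Illustrative note (not part of the original test): the reference
+# implementation below computes the per-element Gaussian negative
+# log-likelihood
+#     loss = 0.5 * (log(max(var, eps)) + (input - label)**2 / max(var, eps))
+# plus the constant term 0.5 * log(2 * pi) when full=True, followed by the
+# requested 'none'/'sum'/'mean' reduction.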
+ +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.base import core + +np.random.seed(10) + + +def ref_gaussian_nll_loss( + input, label, variance, full=False, eps=1e-6, reduction="none" +): + if variance.shape != input.shape: + if input.shape[:-1] == variance.shape: + variance = np.expand_dims(variance, -1) + elif input.shape[:-1] == variance.shape[:-1] and variance.shape[-1] == 1: + pass + else: + raise ValueError("variance is of incorrect size") + if reduction != "none" and reduction != "mean" and reduction != "sum": + raise ValueError(reduction + " is not valid") + + if np.any(variance < 0): + raise ValueError("var has negative entry/entries") + + variance = variance.copy() + variance = np.clip(variance, a_min=eps, a_max=None) + + loss = 0.5 * (np.log(variance) + (input - label) ** 2 / variance) + if full: + loss += 0.5 * np.log(2 * np.pi) + + if reduction == "none": + return loss + elif reduction == "sum": + return [np.sum(loss)] + elif reduction == "mean": + return [np.mean(loss)] + + +class TestGaussianNLLLossAPI(unittest.TestCase): + # test paddle.nn.functional.gaussian_nll_loss, paddle.nn.gaussian_nll_loss + + def setUp(self, type=None): + self.shape = [10, 2] + if type in ["float16", "float64", "int32", "int64"]: + dtype = np.dtype(type) + self.input_np = np.random.random(self.shape).astype(dtype) + self.label_np = np.random.random(self.shape).astype(dtype) + self.variance_np = np.ones(self.shape).astype(dtype) + elif type == "broadcast1": + self.shape = [10, 2, 3] + self.broadcast_shape = [10, 2] + self.input_np = np.random.random(self.shape).astype(np.float32) + self.label_np = np.random.random(self.shape).astype(np.float32) + self.variance_np = np.ones(self.broadcast_shape).astype(np.float32) + elif type == "broadcast2": + self.shape = [10, 2, 3] + self.broadcast_shape = [10, 2, 1] + self.input_np = np.random.random(self.shape).astype(np.float32) + self.label_np = np.random.random(self.shape).astype(np.float32) + self.variance_np = np.ones(self.broadcast_shape).astype(np.float32) + else: + dtype = np.dtype("float32") + self.input_np = np.random.random(self.shape).astype(dtype) + self.label_np = np.random.random(self.shape).astype(dtype) + self.variance_np = np.ones(self.shape).astype(dtype) + if type == "test_err": + self.variance_np = -np.ones(self.shape).astype(np.float32) + + self.place = ( + paddle.CUDAPlace(0) if core.is_compiled_with_cuda() else paddle.CPUPlace() + ) + + def test_dynamic_case(self, type=None, full=False, reduction="none"): + self.setUp(type) + paddle.disable_static(self.place) + + input_x = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np) + variance = paddle.to_tensor(self.variance_np) + if type in ["test_err", "int32", "int64"]: + self.assertRaises( + ValueError, + paddle.nn.functional.gaussian_nll_loss, + input=input_x, + label=label, + variance=variance, + ) + else: + out_ref = ref_gaussian_nll_loss( + self.input_np, + self.label_np, + self.variance_np, + full=full, + reduction=reduction, + ) + out1 = F.gaussian_nll_loss( + input_x, label, variance, full=full, reduction=reduction + ) + gaussian_nll_loss = paddle.nn.GaussianNLLLoss(full, reduction=reduction) + out2 = gaussian_nll_loss(input_x, label, variance) + + for r in [out1, out2]: + np.allclose(out_ref, r.numpy(), rtol=1e-5, atol=1e-5) + paddle.enable_static() + + def test_static_case(self, type=None, full=False, reduction="none"): + self.setUp(type) + paddle.enable_static() + with 
paddle.static.program_guard(paddle.static.Program()): + if type in ["int32", "int64", "float64"]: + input_x = paddle.static.data("Input_x", self.shape, type) + label = paddle.static.data("Label", self.shape, type) + variance = paddle.static.data("Variance", self.shape, type) + elif type in ["broadcast1", "broadcast2"]: + input_x = paddle.static.data("Input_x", self.shape) + label = paddle.static.data("Label", self.shape) + variance = paddle.static.data("Variance", self.broadcast_shape) + else: + input_x = paddle.static.data("Input_x", self.shape, "float32") + label = paddle.static.data("Label", self.shape, "float32") + variance = paddle.static.data("Variance", self.shape, "float32") + out1 = F.gaussian_nll_loss( + input_x, label, variance, full=full, reduction=reduction + ) + gaussian_nll_loss = paddle.nn.GaussianNLLLoss(full, reduction=reduction) + out2 = gaussian_nll_loss(input_x, label, variance) + exe = paddle.static.Executor(self.place) + if type not in ["test_err", "int32", "int64"]: + out_ref = ref_gaussian_nll_loss( + self.input_np, + self.label_np, + self.variance_np, + full=full, + reduction=reduction, + ) + res = exe.run( + feed={ + "Input_x": self.input_np, + "Label": self.label_np, + "Variance": self.variance_np, + }, + fetch_list=[out1, out2], + ) + for r in res: + np.allclose(out_ref, r, rtol=1e-5, atol=1e-5) + else: + try: + res = exe.run( + feed={ + "Input_x": self.input_np, + "Label": self.label_np, + "Variance": self.variance_np, + }, + fetch_list=[out1, out2], + ) + except ValueError: + pass + + def test_api(self): + self.test_dynamic_case() + self.test_static_case() + + def test_float64(self): + self.test_dynamic_case("float64") + self.test_static_case("float64") + + def test_broadcast(self): + self.test_dynamic_case("broadcast1") + self.test_static_case("broadcast1") + + def test_broadcast_with_same_dim(self): + self.test_dynamic_case("broadcast2") + self.test_static_case("broadcast2") + + def test_reduction(self): + self.test_dynamic_case(full=True, reduction="mean") + self.test_dynamic_case(full=True, reduction="sum") + self.test_static_case(full=True, reduction="mean") + + def test_error(self): + self.test_dynamic_case("test_err") + self.test_static_case("test_err") + + def test_int(self): + self.test_dynamic_case("int64") + self.test_dynamic_case("int32") + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_greater_equal.py b/backends/metax_gpu/tests/unit_test/test_greater_equal.py new file mode 100644 index 00000000000..816d6075099 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_greater_equal.py @@ -0,0 +1,44 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
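+#
+# Illustrative note (not part of the original test): paddle.greater_equal is
+# an elementwise x >= y that returns a bool tensor, so comparing [3, 3] with
+# [3, 2] below yields [True, True].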
+ + +import unittest + +import numpy as np + +import paddle +from paddle import static + + +class Test_Greater_Equal_Op_Fp16(unittest.TestCase): + def test_api_fp16(self): + paddle.enable_static() + with static.program_guard(static.Program(), static.Program()): + label = paddle.to_tensor([3, 3], dtype="float16") + limit = paddle.to_tensor([3, 2], dtype="float16") + out = paddle.greater_equal(x=label, y=limit) + # if core.is_compiled_with_cuda(): + # place = paddle.CUDAPlace(0) + # exe = static.Executor(place) + # (res,) = exe.run(fetch_list=[out]) + # self.assertEqual((res == np.array([True, True])).all(), True) + place = paddle.CustomPlace(paddle.device.get_device().split(":")[0], 0) + exe = static.Executor(place) + (res,) = exe.run(fetch_list=[out]) + self.assertEqual((res == np.array([True, True])).all(), True) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py new file mode 100644 index 00000000000..b4e4282c5ce --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py @@ -0,0 +1,62 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import unittest + +import numpy as np + +import paddle +from paddle.incubate.nn.functional import build_src_rank_and_local_expert_id + +logger = logging.getLogger(__name__) + + +class TestFusedCalculateAuxLoss(unittest.TestCase): + def test_build_src_rank_and_local_expert_id(self): + def orig_func(expert_num_global_list, num_local_experts): + send_rank_cpu = np.concatenate( # TOO SLOW!!! 
breaks everything
+                [
+                    np.full([j], i // num_local_experts, dtype="int32")
+                    for i, j in enumerate(expert_num_global_list)
+                ],
+                0,
+            )
+            local_expert_id_cpu = np.concatenate(
+                [
+                    np.full([j], i % num_local_experts, dtype="int32")
+                    for i, j in enumerate(expert_num_global_list)
+                ],
+                0,
+            )
+            send_rank = paddle.to_tensor(send_rank_cpu)
+            local_expert_id = paddle.to_tensor(local_expert_id_cpu)
+            return send_rank, local_expert_id
+
+        def fused_func(expert_num_global_tensor, expert_num_global, num_local_experts):
+            return build_src_rank_and_local_expert_id(
+                expert_num_global_tensor, expert_num_global, num_local_experts
+            )
+
+        expert_num_global = np.random.randint(0, 512, size=[12 * 8], dtype="int32")
+        expert_num_global_tensor = paddle.to_tensor(expert_num_global, dtype="int64")
+
+        s1, l1 = orig_func(expert_num_global, 12)
+        s2, l2 = fused_func(expert_num_global_tensor, expert_num_global, 12)
+        assert ((s1 - s2) == 0).all(), (s1, s2)
+        assert ((l1 - l2) == 0).all(), (l1, l2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py
new file mode 100644
index 00000000000..2d5670ee739
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py
@@ -0,0 +1,172 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from collections import namedtuple
+from functools import partial
+
+from ernie_utils.moe_all_gather_layer import MOEAllGatherLayerV2
+
+import paddle
+import paddle.nn.functional as F
+from paddle.incubate.nn.functional import expand_modality_expert_id
+
+
+def fused_gate_logits_process_ref(self, gate_logits_lm, gate_logits_mm, token_type_ids):
+    """Process gate logits."""
+    top_k = self.k
+    num_expert_per_rank_per_modality = (
+        gate_logits_lm.shape[-1] // self.config.moe_world_size
+    )
+
+    @paddle.no_grad()
+    def shift_ids(ids, modality_offset):
+        # For now, assume every modality has the same number of experts.
+        rank = ids // num_expert_per_rank_per_modality
+        expert_id_in_rank = ids % num_expert_per_rank_per_modality
+        return (
+            rank * (num_expert_per_rank_per_modality * 2)
+            + expert_id_in_rank
+            + modality_offset * num_expert_per_rank_per_modality
+        )
+
+    if self.group_experts:
+        gate_logits_lm = gate_logits_lm.reshape([gate_logits_lm.shape[0], top_k, -1])
+        prob_lm = self.gate.act(gate_logits_lm)
+        weight_lm, expert_id_lm = prob_lm.topk(k=1, axis=-1)
+        weight_lm = weight_lm.reshape([gate_logits_lm.shape[0], -1])
+        expert_id_lm = expert_id_lm.reshape([gate_logits_lm.shape[0], -1])
+        group_size = gate_logits_lm.shape[-1]
+        scale = paddle.arange(0, top_k * group_size, group_size).unsqueeze(0)
+        expert_id_lm = expert_id_lm + scale
+    else:
+        prob_lm = self.gate.act(gate_logits_lm)
+        weight_lm, expert_id_lm = prob_lm.topk(k=top_k, axis=-1)
+    if token_type_ids is not None:
+        expert_id_lm = shift_ids(expert_id_lm, 0)
+    expert_id_lm.stop_gradient = True
+    lm_weight_and_expert_id = paddle.concat(
+        [weight_lm, expert_id_lm.astype("float32")], -1
+    )
+    if token_type_ids is None:
+        return (
+            lm_weight_and_expert_id,
+            prob_lm.reshape([prob_lm.shape[0], -1]),
+            None,
+        )
+
+    prob_mm = self.gate.act(gate_logits_mm)
+    weight_mm, expert_id_mm = prob_mm.topk(k=top_k, axis=-1)
+
+    expert_id_mm = shift_ids(expert_id_mm, 1)
+    expert_id_mm.stop_gradient = True
+
+    mm_weight_and_expert_id = paddle.concat(
+        [weight_mm, expert_id_mm.astype("float32")], -1
+    )
+
+    token_type_ids_float = token_type_ids[:, None].astype("float32")
+    weight_and_expert = (
+        1 - token_type_ids_float
+    ) * lm_weight_and_expert_id + token_type_ids_float * mm_weight_and_expert_id
+    return weight_and_expert, prob_lm.reshape([prob_lm.shape[0], -1]), prob_mm
+
+
+def test_expand_modality_expert_id():
+    def expand_id_one(
+        expert_id,
+        num_expert_per_modality,
+        k,
+        group_size,
+        modality_offset,
+        is_group_expert,
+    ):
+        orig_shape = expert_id.shape
+        expert_id = expert_id.reshape([-1])
+        xid = paddle.arange(len(expert_id))
+        if is_group_expert:
+            eid = xid % k
+            expert_id += eid * group_size
+
+        rank = expert_id // num_expert_per_modality
+        expert_id_in_rank = expert_id % num_expert_per_modality
+        ret = (
+            rank * (num_expert_per_modality * 2)
+            + expert_id_in_rank
+            + modality_offset * num_expert_per_modality
+        )
+        return ret.reshape(orig_shape)
+
+    S, E, k = 100, 24, 3
+    expert_id_mm = paddle.randint(0, 12, shape=[S, k])
+    num_expert_per_rank_per_modality = E // 2 // 4
+    group_size = E // 2 // k
+    print(f"num_expert_per_rank_per_modality: {num_expert_per_rank_per_modality}")
+    fused = expand_modality_expert_id(
+        expert_id_mm, num_expert_per_rank_per_modality, group_size, 1, True
+    )
+
+    nonfused = expand_id_one(
+        expert_id_mm, num_expert_per_rank_per_modality, k, group_size, 1, True
+    )
+    # num_expert_per_rank_per_modality, group_size
+    assert (fused == nonfused).all().item()
+
+    Config = namedtuple("Config", ["moe_world_size"])
+    Self = 
namedtuple( + "Self", + [ + "config", + "k", + "gate", + "group_experts", + "moe_statics", + "use_correction_bias", + ], + ) + Gate = namedtuple("Gate", ["act"]) + fake_gate = Gate(act=partial(F.softmax, axis=-1)) + fake_self = Self( + config=Config( + moe_world_size=8, + ), + k=k, + gate=fake_gate, + moe_statics=None, + use_correction_bias=False, + group_experts=True, + ) + + fake_logits = paddle.randn([S, E]) + fake_logits_mm = paddle.randn([S, E]) + token_type_ids = paddle.randint(0, 2, shape=[S]) + w_and_e, prob_lm, prob_mm = MOEAllGatherLayerV2.fused_gate_logits_process_fused( + fake_self, fake_logits, fake_logits_mm, None + ) + w_and_e_ref, prob_lm_ref, prob_mm_ref = fused_gate_logits_process_ref( + fake_self, fake_logits, fake_logits_mm, None + ) + assert (prob_lm == prob_lm_ref).all().item() + assert (w_and_e == w_and_e_ref).all().item() + w, e = w_and_e_ref.chunk(2, axis=-1) + + +class Test_expand_modality_expert_id_API(unittest.TestCase): + def test_dygraph(self): + test_expand_modality_expert_id() + + +if __name__ == "__main__": + + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py b/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py new file mode 100644 index 00000000000..ca0a780e908 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py @@ -0,0 +1,95 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
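+#
+# Illustrative note (not part of the original test): the reference below
+# implements RMSNorm,
+#     y = x / sqrt(mean(x**2, axis=-1) + eps) * scale
+# and also returns invvar = 1 / sqrt(mean(x**2, axis=-1) + eps), which is what
+# fused_rms_norm_ext is checked against in both forward and backward passes.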
+ +import unittest + +import numpy as np + +import paddle +from paddle.incubate.nn.functional import fused_rms_norm_ext + + +class TestFusedRMSNorm(unittest.TestCase): + def setUp(self): + paddle.seed(2023) + np.random.seed(2023) + + def rms_norm_reference(self, x, scale, bias=None, epsilon=1e-5): + variance = paddle.mean(paddle.square(x), axis=-1, keepdim=True) + + rms = paddle.sqrt(variance + epsilon) + y = x / rms + y = y * scale.reshape([1, -1]) + if bias is not None: + y = y + bias.reshape([1, -1]) + return y, (1.0 / rms).squeeze(-1) + + def test_2d_input(self): + rows, cols = 32, 64 + x = paddle.randn([rows, cols]) + scale = paddle.randn([cols]) + y_fused, invvar_fused = fused_rms_norm_ext(x, scale) + + y_ref, invvar_ref = self.rms_norm_reference(x, scale) + + np.testing.assert_allclose(y_fused, y_ref, rtol=1e-5, atol=1e-5) + np.testing.assert_allclose(invvar_fused, invvar_ref, rtol=1e-5, atol=1e-5) + + def test_without_bias(self): + + rows, cols = 32, 64 + x = paddle.randn([rows, cols]) + scale = paddle.randn([cols]) + + y_fused, invvar_fused = fused_rms_norm_ext(x, scale) + + y_ref, invvar_ref = self.rms_norm_reference(x, scale) + + np.testing.assert_allclose(y_fused, y_ref, rtol=1e-5, atol=1e-5) + np.testing.assert_allclose(invvar_fused, invvar_ref, rtol=1e-5, atol=1e-5) + + def test_backward(self): + + rows, cols = 16, 32 + x = paddle.randn([rows, cols], dtype="float32") + x.stop_gradient = False + scale = paddle.randn([cols], dtype="float32") + scale.stop_gradient = False + + y_fused, invvar = fused_rms_norm_ext(x, scale) + + loss = paddle.mean(y_fused) + loss.backward() + + x_grad_fused = x.grad.clone() + scale_grad_fused = scale.grad.clone() + + x.clear_gradient() + scale.clear_gradient() + + y_ref, invvar_ref = self.rms_norm_reference(x, scale) + loss_ref = paddle.mean(y_ref) + loss_ref.backward() + + x_grad_ref = x.grad + scale_grad_ref = scale.grad + + np.testing.assert_allclose(x_grad_fused, x_grad_ref, rtol=1e-4, atol=1e-4) + np.testing.assert_allclose( + scale_grad_fused, scale_grad_ref, rtol=1e-4, atol=1e-4 + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py new file mode 100644 index 00000000000..23df4e3457b --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py @@ -0,0 +1,193 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
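+
+# moe_combine mixes gathered expert outputs back into token order; the
+# reference `combining` below computes, for each token s:
+#     y[s] = sum_k combine_weights[s, k] * x[scatter_index[s, k]]
+# e.g. with scatter_index[s] = [2, 5] and combine_weights[s] = [0.7, 0.3],
+# y[s] = 0.7 * x[2] + 0.3 * x[5].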
+
+import os
+import random
+import unittest
+
+import numpy as np
+from ernie_utils.moe_layer_uneven import GateCombine
+
+import paddle
+import paddle.nn.functional as F
+from paddle.incubate.nn.functional import moe_combine
+
+os.environ["FLAGS_flash_attn_version"] = "v1"
+os.environ["FLAGS_cudnn_deterministic"] = "1"
+os.environ["FLAGS_embedding_deterministic"] = "1"
+
+
+def combining(x, combine_weights, scatter_index, hard_gate=False):
+    """
+    Args:
+        x: Tensor[seq, dim]
+        combine_weights: [seq, k]
+        scatter_index: ** [seq, k] **
+
+    Returns:
+        y: Tensor[s, dim]
+    """
+    x_gathered = F.embedding(scatter_index, x)  # [s,k,dim]
+    if hard_gate:
+        return x_gathered.squeeze(-2)
+    # logger.info(f'combining: {combine_weights}')
+    y = (combine_weights.unsqueeze(-1) * x_gathered).sum(1)
+    # y = paddle.matmul(combine_weights.unsqueeze(1), x_gathered).squeeze()  # [s,1,k] @ [s,k,dim] -> [s,1,dim]
+    return y
+
+
+def baseline_result(x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy):
+    """Baseline result computed with the unfused reference `combining`."""
+    scatter_index = paddle.to_tensor(scatter_index_numpy)
+    x = paddle.to_tensor(x_numpy).cast("float32")
+    x.stop_gradient = False
+
+    combine_weights = paddle.to_tensor(combine_weights_numpy).cast("float32")
+    combine_weights.stop_gradient = False
+
+    scatter_index = paddle.to_tensor(scatter_index_numpy)
+    grad = paddle.to_tensor(grad_numpy).cast("float32")
+
+    y = combining(x, combine_weights, scatter_index)
+    paddle.autograd.backward([y], [grad], True)
+    return [x.grad, combine_weights.grad, y]
+
+
+def test_moe_combine(x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy):
+    """Fused result computed via GateCombine / moe_combine."""
+    x = paddle.to_tensor(x_numpy).cast("float32")
+    x.stop_gradient = False
+
+    combine_weights = paddle.to_tensor(combine_weights_numpy).cast("float32")
+    combine_weights.stop_gradient = False
+
+    scatter_index = paddle.to_tensor(scatter_index_numpy).cast("int32")
+    grad = paddle.to_tensor(grad_numpy).cast("float32")
+
+    y = GateCombine.apply(x, combine_weights, scatter_index)
+    paddle.autograd.backward([y], [grad], True)
+    # grad.backward()
+    return [x.grad, combine_weights.grad, y]
+
+
+def gen_test_case(S, K, Dim, capacity_factor, seed=1234):
+    """gen_test_case"""
+    random.seed(seed)
+    np.random.seed(seed)
+    paddle.seed(seed)
+    x_numpy = np.random.rand(int(S * capacity_factor), Dim).astype(np.float32)
+    combine_weights_numpy = np.random.rand(S, K).astype(np.float32)
+    scatter_index_numpy = np.random.permutation(max(x_numpy.shape[0], S * K))[
+        : S * K
+    ].astype("int64")
+    scatter_index_numpy = scatter_index_numpy.reshape([S, K])
+
+    combine_weights_numpy[scatter_index_numpy >= x_numpy.shape[0]] = 0
+    scatter_index_numpy[scatter_index_numpy >= x_numpy.shape[0]] = 0
+    grad_numpy = np.random.randn(S, Dim).astype(np.float32)
+    return x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy
+
+
+def testing(test_case):
+    """testing"""
+    [bl_x_grad, bl_combine_weights_grad, bl_y] = baseline_result(*test_case)
+    [fused_x_grad, fused_combine_weights_grad, fused_y] = test_moe_combine(*test_case)
+    np.testing.assert_allclose(
+        fused_y.astype("float32").numpy(),
+        bl_y.astype("float32").numpy(),
+        err_msg="fwd precision not pass",
+        rtol=1e-6,
+    )
+    np.testing.assert_allclose(
+        fused_x_grad.astype("float32").numpy(),
+        bl_x_grad.astype("float32").numpy(),
+        rtol=1e-6,
+        err_msg="bwd grad precision not pass",
+    )
+    np.testing.assert_allclose(
+        fused_combine_weights_grad.astype("float32").numpy(),
+        bl_combine_weights_grad.astype("float32").numpy(),
+        rtol=1e-6,
+    )
+
+
+class TestFused(unittest.TestCase):
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_cap_lt_2(
+        self,
+    ):
+        """
+        Check numerical parity between the fused op and the baseline.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=1.8))
+
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_cap_eq_2(
+        self,
+    ):
+        """
+        Check numerical parity between the fused op and the baseline.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=2))
+
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_cap_gt_2(
+        self,
+    ):
+        """
+        Check numerical parity between the fused op and the baseline.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=2.2))
+
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_k_gt_2(
+        self,
+    ):
+        """
+        Check numerical parity between the fused op and the baseline.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=8, Dim=4096, capacity_factor=2))
+
+
+if __name__ == "__main__":
+
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py
new file mode 100644
index 00000000000..4c209970629
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py
@@ -0,0 +1,218 @@
+# ruff: noqa: C419
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
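+
+# moe_gate_dispatch_partial_nosoftmaxtopk dispatches tokens only to the experts
+# in [expert_start_index, expert_end_index); the tests below run it once per
+# expert shard and check that concatenating the per-shard outputs (and summing
+# the per-shard combine weights) reproduces a single full moe_gate_dispatch call.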
+ +import unittest + +import paddle +from paddle.incubate.nn.functional import ( + moe_gate_dispatch, + moe_gate_dispatch_partial_nosoftmaxtopk, +) + + +def test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op(): + + s, d, e = 4, 100, 8 + k, cap = 4, 3 + local_expert_num = 2 + + # x = paddle.randn([s, d]) + # gate_logits = paddle.randn([s, e]) + x = paddle.arange(1, s + 1).unsqueeze(-1).expand([s, d]).astype("bfloat16") + x_ = x.clone().detach() + + t = ( + (paddle.arange(0, e)).unsqueeze(0) + paddle.arange(0, -s, -1).unsqueeze(-1) + ) % e + gate_logits = (1 / (t + 1)).astype("float32") + # gate_logits = F.softmax(paddle.randn([s,e]),-1).astype('float32') + gate_logits_ = gate_logits.clone().detach() + s = x.shape[0] + d = x.shape[1] + e = gate_logits.shape[1] + x.stop_gradient = False + x_.stop_gradient = False + gate_logits.stop_gradient = False + gate_logits_.stop_gradient = False + print(f"gate_logits:{gate_logits}") + + def check_ascend(index_rev, chunks): + for idx in index_rev.split(chunks.tolist()): + if len(idx) > 2: + assert (paddle.diff(idx) >= 0).all(), (index_rev,) + + ys, comm, scatter_idx = [], [], [] + for ilocal_expert in range(0, e, local_expert_num): + combine_weihgts, expert_id = gate_logits.topk(k=k, axis=1) + ( + y, + combine_weihgts, + scatter_index, + scatter_index_rev, + expert_offset, + expert_num_local, + ) = moe_gate_dispatch_partial_nosoftmaxtopk( + x, + combine_weihgts, + expert_id.astype("int32"), + k=k, + capacity=cap, + num_experts=gate_logits.shape[-1], + use_pad=False, + expert_start_index=ilocal_expert, + expert_end_index=ilocal_expert + local_expert_num, # k # cap + reverse_token_drop=False, + ) + check_ascend(scatter_index_rev, expert_num_local) + print(f"y:{y.mean(-1)}") + print(f"combine_weihgts:{combine_weihgts}") + print(f"expert_num_local:{expert_num_local}") + print(f"scatter_index:{scatter_index.transpose([1,0])}") + print(f"scatter_index_rev:{scatter_index_rev}") + + ys.append(y) + comm.append(combine_weihgts) + scatter_idx.append(scatter_index) + + comm_sum = paddle.stack(comm).sum(0) + ys_sum = paddle.concat(ys) + + ( + y_, + combine_weihgts_, + scatter_index_, + expert_offset_, + expert_id_, + ) = moe_gate_dispatch( + x_, + gate_logits_, + None, + k=k, + capacity=cap, + use_pad=True, # k # cap + ) + valid_y = y_.sum(-1) > 0.0 + y_2 = y_[valid_y].squeeze() + + print( + f""" + y: {ys_sum.astype("float32").mean(axis=-1)} + y_: {y_2.astype("float32").mean(axis=-1)} + + comm-weight: {comm_sum} + comm-weight_: {combine_weihgts_} + + expert_id:{expert_id} + scatter_index:{scatter_index} + scatter_index_rev: {scatter_index_rev} + expert_num_global:{expert_offset} + expert_num_local:{expert_num_local} + """ + ) + + print("<<< begin backward>>>") + + assert combine_weihgts_.shape == combine_weihgts.shape, ( + combine_weihgts_.shape, + combine_weihgts.shape, + ) + + dysum, dcombine_weights_sum = paddle.ones_like(ys_sum), paddle.randn( + comm_sum.shape + ).astype(comm_sum.dtype) + dy_, dcombine_weights_ = paddle.ones_like(y_), paddle.ones_like(combine_weihgts_) + dy_[~valid_y] = 0 + + y_shapes = [len(y) for y in ys] + for dyy, yy, commm in zip( + paddle.split(dysum, y_shapes), + ys, + comm, + ): + print(f"dyy:{dyy.shape}, {yy.shape} {commm.shape}") + paddle.autograd.backward([yy, commm], [dyy, dcombine_weights_sum]) + print(x.grad.astype("float32").mean(axis=-1)) + print(f"bwd original:{y_.shape} {dy_.shape}") + paddle.autograd.backward([y_, combine_weihgts_], [dy_, dcombine_weights_]) + + print(x_.grad.astype("float32").mean(axis=-1)) + + print( + f""" + 
x: {x.grad.astype('float32').mean(axis=-1)} + x_: {x_.grad.astype('float32').mean(axis=-1)} + """ + ) + + +def test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop(): + + S, E, D = 3, 4, 3 + k = 2 + capacity = 2 + x = (paddle.arange(S) + 1).unsqueeze(-1).expand([S, D]).astype("bfloat16") + cw = paddle.randn([S, k]) + eid = paddle.to_tensor([[0, 1], [0, 1], [0, 2]], dtype="int32") # 1 # 2 # 3 + ( + y, + cw_, + idx, + idx_rev, + num_ex_global, + num_ex_local, + ) = moe_gate_dispatch_partial_nosoftmaxtopk( + x, cw, eid, k, capacity, E, False, 0, 2, reverse_token_drop=True + ) + + y0, y1 = y.split([i for i in num_ex_local.tolist() if i > 0]) + assert y0[:, 0].astype("int32").tolist() == [2, 3], y0[:, 0] + assert y1[:, 0].astype("int32").tolist() == [1, 2] + + +def test_moe_ops_partial_nosoftmax_topk_empty_output(): + + S, E, D = 3, 4, 3 + k = 2 + capacity = 2 + x = (paddle.arange(S) + 1).unsqueeze(-1).expand([S, D]).astype("bfloat16") + cw = paddle.randn([S, k]) + paddle.device.synchronize() + eid = paddle.to_tensor([[0, 1], [0, 1], [0, 2]], dtype="int32") # 1 # 2 # 3 + ( + y, + cw_, + idx, + idx_rev, + num_ex_global, + num_ex_local, + ) = moe_gate_dispatch_partial_nosoftmaxtopk( + x, cw, eid, k, capacity, E, False, 3, 4, reverse_token_drop=True + ) + assert all([i == 0 for i in num_ex_local.tolist()]), num_ex_local + + +class TestAddition(unittest.TestCase): + def test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op(self): + test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op() + + def test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop(self): + test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop() + + def test_moe_ops_partial_nosoftmax_topk_empty_output(self): + test_moe_ops_partial_nosoftmax_topk_empty_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py new file mode 100644 index 00000000000..19752abd904 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py @@ -0,0 +1,207 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
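+
+# moe_gate_dispatch_permute is expected to agree with plain moe_gate_dispatch
+# up to a layout change: get_stage_input_list below rearranges the plain
+# dispatch output into per-stage groups (stages = num_experts // world_size)
+# before the md5sum comparison of the two results.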
+
+import os
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.nn.functional as F
+from paddle.incubate.nn.functional import (
+    moe_gate_dispatch,
+    moe_gate_dispatch_permute,
+)
+
+os.environ["FLAGS_flash_attn_version"] = "v1"
+os.environ["FLAGS_cudnn_deterministic"] = "1"
+os.environ["FLAGS_embedding_deterministic"] = "1"
+
+
+class TestFused(unittest.TestCase):
+    def test_moe_ops(self):
+        """
+        Test `moe-ops` with a routing bias.
+        """
+        S, E, D = 8192, 64, 128
+        k = 4
+        x = paddle.randn([S, D], dtype="bfloat16")
+        gate_logits = paddle.randn([S, E], dtype="float32")
+        x_ = x.clone()
+        gate_logits_ = gate_logits.clone()
+        x.stop_gradient = True
+        x_.stop_gradient = True
+        gate_logits.stop_gradient = True
+        gate_logits_.stop_gradient = True
+        bias = paddle.zeros([E], dtype="float32")
+        cap = 512
+
+        (
+            y,
+            combine_weights,
+            scatter_index,
+            expert_offset_,
+            expert_id_,
+        ) = moe_gate_dispatch(
+            x,
+            gate_logits,
+            None,
+            k=k,
+            capacity=cap,
+            use_pad=True,  # k # cap
+        )
+
+        (
+            y_,
+            combine_weights_,
+            scatter_index_,
+            expert_offset_,
+            expert_id_,
+        ) = moe_gate_dispatch(
+            x_,
+            gate_logits_,
+            bias + 1,  # a uniform +1 on every expert must not change the routing result
+            k=k,
+            capacity=cap,
+            use_pad=True,  # k # cap
+        )
+        bias_unbalanced = bias.clone()
+        bias_unbalanced[0] += 1
+        (
+            y__,
+            combine_weights__,
+            scatter_index__,
+            expert_offset__,
+            expert_id__,
+        ) = moe_gate_dispatch(
+            x_,
+            gate_logits_,
+            bias_unbalanced,
+            k=k,
+            capacity=cap,
+            use_pad=True,  # k # cap
+        )
+        np.testing.assert_equal(
+            y.astype("float32").numpy(),
+            y_.astype("float32").numpy(),
+            err_msg="incubate w bias not match",
+        )
+        # the bias must not affect the combine probabilities
+        np.testing.assert_equal(
+            combine_weights.astype("float32").numpy(),
+            combine_weights_.astype("float32").numpy(),
+            err_msg="incubate w bias not match",
+        )
+        np.testing.assert_(
+            (y.astype("float32").numpy() != y__.astype("float32").numpy()).any(),
+        )
+
+
+class TestDispatchPermute(unittest.TestCase):
+    def get_detached_input(self, input, prob):
+        ret_input = input.detach()
+        ret_prob = prob.detach()
+        ret_input.stop_gradient = input.stop_gradient
+        ret_prob.stop_gradient = prob.stop_gradient
+        return ret_input, ret_prob
+
+    def get_stage_input_list(self, x, world_size, stage):
+        print(world_size, stage, x.shape)
+        x = x.reshape([world_size * stage, -1, x.shape[-1]])
+        stage_input_list = []
+        x_list = paddle.split(x, num_or_sections=(world_size * stage), axis=0)
+        for stage_id in range(stage):
+            stage_input_list.append(
+                paddle.unsqueeze(paddle.concat(x_list[stage_id::stage], axis=0), axis=0)
+            )
+        stage_input_list = paddle.concat(stage_input_list, axis=0)
+        return stage_input_list
+
+    def test_moe_permute_ops(self):
+        paddle.seed(2025)
+
+        test_cases = [
+            (8, 4, 2),
+            (64, 16, 32),
+            (1024, 1024, 1024),
+            (8, 2, 4),
+            (4096, 4096, 4096),
+        ]
+        cases = list(zip(*test_cases))
+        for _, case in enumerate(cases):
+            world_size, num_experts, num_tokens, k, hidden_size = case
+            capacity = num_tokens // k
+            stages = num_experts // world_size
+
+            input = paddle.randn([num_tokens, hidden_size], dtype="float32")
+            prob_logits = paddle.randn([num_tokens, num_experts], dtype="float32")
+            prob = F.softmax(prob_logits, axis=-1)
+            input.stop_gradient = False
+            prob.stop_gradient = False
+
+            compat_args = (None,)
+
+            ref_input, ref_prob = self.get_detached_input(input, prob)
+            (
+                ref_dispatched_input,
+                ref_combine_weights_unnorm,
+                ref_scatter_index,
+                ref_dispatch_mask,
+                _,
+            ) = moe_gate_dispatch(
+                ref_input,
+                ref_prob,
+                *compat_args,
+                k=k,
+                capacity=capacity,
+                use_pad=True,
+            )
+
+            
ref_stage_input_list = self.get_stage_input_list( + ref_dispatched_input, world_size, stages + ) + + test_input, test_prob = self.get_detached_input(input, prob) + ( + test_dispatched_input, + test_combine_weights_unnorm, + test_scatter_index, + test_dispatch_mask, + _, + ) = moe_gate_dispatch_permute( + test_input, + test_prob, + *compat_args, + k=k, + capacity=capacity, + world_size=world_size, + ) + + np.testing.assert_equal( + test_dispatched_input.shape, + ref_stage_input_list.shape, + err_msg="moe_permute_ops not match", + ) + np.testing.assert_equal( + test_dispatched_input._md5sum(), + ref_stage_input_list._md5sum(), + err_msg="moe_permute_ops not match", + ) + + +if __name__ == "__main__": + + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py new file mode 100644 index 00000000000..14991becc47 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py @@ -0,0 +1,175 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.incubate.nn.functional import ( + moe_gate_dispatch, + moe_gate_dispatch_permute, +) + +batch_size = 4 +hidden_size = 2 +k = 16 +capacity = 2 +num_experts = 16 + +world_size = 2 + + +class TestLayer(paddle.nn.Layer): + def forward(self, x, gate_prob, k, capacity): + y, combine_weights, scatter_index, expert_offset, expert_id = moe_gate_dispatch( + x, gate_prob, None, k, capacity, True + ) + return y, combine_weights, scatter_index, expert_offset, expert_id + + +class TestLayerPermute(paddle.nn.Layer): + def forward(self, x, gate_prob, k, capacity): + ( + y, + combine_weights, + scatter_index, + expert_offset, + expert_id, + ) = moe_gate_dispatch_permute( + x, gate_prob, None, k, capacity, world_size=world_size + ) + return y, combine_weights, scatter_index, expert_offset, expert_id + + +def check_backward_correctness(layer_cls): + paddle.seed(1024) + + dtype = "bfloat16" + layer = layer_cls() + input = paddle.randn([batch_size, hidden_size]) + + gate_weight = paddle.randn([hidden_size, num_experts]) + logits = paddle.matmul(input, gate_weight) + gate_prob = F.softmax(logits, axis=-1) + print(f"gate_prob: {gate_prob}") + + input = paddle.cast(input, "bfloat16") + input.stop_gradient = False + gate_prob.stop_gradient = False + + output, combine_weights, scatter_index, expert_offset, expert_id = layer( + input, gate_prob, k, capacity + ) + + print(f"output: {output}") + print(f"combine_weights: {combine_weights}") + print(f"scatter_index: {scatter_index}") + print(f"expert_offset: {expert_offset}") + print(f"expert_id: {expert_id}") + + # output_g = paddle.randn(output.shape).astype(output.dtype) + # combine_weights_g = paddle.randn(combine_weights.shape).astype(combine_weights.dtype) + 
output_g = paddle.ones_like(output)
+    combine_weights_g = paddle.ones_like(combine_weights)
+    print(f"output_g: {output_g}")
+    print(f"combine_weights_g: {combine_weights_g}")
+
+    paddle.autograd.backward(
+        tensors=[output, combine_weights],
+        grad_tensors=[output_g, combine_weights_g],
+    )
+    # numerically estimate the gradient w.r.t. the input
+    epsilon = 0.005
+    input_numpy = input.detach().astype("float32").numpy()
+    num_grad = paddle.zeros_like(input)
+    flattened = num_grad.reshape([-1])
+
+    for i in range(input.numel()):
+        input_pos = input_numpy.copy()
+        input_neg = input_numpy.copy()
+        input_pos.flat[i] += epsilon
+        input_neg.flat[i] -= epsilon
+
+        output_pos, _, _, _, _ = layer(
+            paddle.to_tensor(input_pos), gate_prob, k, capacity
+        )
+        output_neg, _, _, _, _ = layer(
+            paddle.to_tensor(input_neg), gate_prob, k, capacity
+        )
+
+        """
+        flattened[i] = (output_pos.astype("float32").numpy() - output_neg.astype("float32").numpy()).sum() / (
+            2 * epsilon
+        )
+        """
+        grad_value = (output_pos - output_neg).sum() / (2 * epsilon)
+        flattened[i] = grad_value
+
+    flattened = flattened.reshape(input.shape)
+
+    print(f"input gradient: {input.grad}")
+    print(f"numerical gradient: {flattened}")
+    np.testing.assert_allclose(
+        input.grad.astype("float32").numpy(),
+        flattened.astype("float32").numpy(),
+        rtol=1e-5,
+        atol=0,
+    )
+
+    # numerically estimate the gradient w.r.t. gate_prob
+    epsilon = 0.0005
+    gate_prob_numpy = gate_prob.detach().astype("float32").numpy()
+    num_grad = paddle.zeros_like(gate_prob)
+    flattened = num_grad.reshape([-1])
+
+    for i in range(gate_prob.numel()):
+        input_pos = gate_prob_numpy.copy()
+        input_neg = gate_prob_numpy.copy()
+        input_pos.flat[i] += epsilon
+        input_neg.flat[i] -= epsilon
+
+        _, output_pos, _, _, _ = layer(input, paddle.to_tensor(input_pos), k, capacity)
+        _, output_neg, _, _, _ = layer(input, paddle.to_tensor(input_neg), k, capacity)
+
+        grad_value = paddle.to_tensor(
+            (output_pos.numpy() - output_neg.numpy()).sum() / (2 * epsilon)
+        )
+        flattened[i] = grad_value
+
+    flattened = flattened.reshape(gate_prob.shape)
+
+    print(f"gate_prob gradient: {gate_prob.grad}")
+    print(f"numerical gradient: {flattened}")
+    np.testing.assert_allclose(
+        gate_prob.grad.astype("float32").numpy(),
+        flattened.astype("float32").numpy(),
+        rtol=1e-4,
+        atol=0,
+    )
+
+
+class TestFused(unittest.TestCase):
+    def test_moe_backward(self):
+        check_backward_correctness(TestLayer)
+
+    def test_moe_permute_backward(self):
+        check_backward_correctness(TestLayerPermute)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_layer_norm.py b/backends/metax_gpu/tests/unit_test/test_layer_norm.py
new file mode 100644
index 00000000000..dbeaee31f6c
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_layer_norm.py
@@ -0,0 +1,358 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
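+
+# Reference computation used below: x is flattened to [N, D] over the dims
+# from begin_norm_axis, then
+#     mean = mean(x, axis=1),  var = var(x, axis=1) + epsilon
+#     y    = scale * (x - mean) / sqrt(var) + bias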
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle
+
+from operator import mul
+import paddle.base.core as core
+import paddle.nn.functional as F
+import paddle.base as base
+from functools import reduce
+from op_test import _set_use_system_allocator
+from paddle.static.amp.fp16_utils import (
+    _keep_layer_norm_scale_bias_to_fp32,
+)
+from paddle.pir_utils import OldIrGuard
+
+paddle.enable_static()
+
+np.random.seed(123)
+
+_set_use_system_allocator(True)
+
+
+def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
+    x_shape = x.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1)
+    x.shape = [N, D]
+
+    mean = np.mean(x, axis=1)
+    var = np.var(x, axis=1) + epsilon
+    output = np.divide((x - mean.reshape([N, 1])), (np.sqrt(var)).reshape([N, 1]))
+    if scale is not None:
+        output = scale.reshape([1, D]) * output
+    if beta is not None:
+        output = output + beta.reshape([1, D])
+
+    x.shape, output.shape = x_shape, x_shape
+    return output, mean, var
+
+
+def _reference_layer_norm_grad(x, grad_y, scale, bias, mean, var, begin_norm_axis=1):
+    x_shape = x.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1)
+
+    if scale is not None:
+        scale_shape = scale.shape
+        scale.shape = [1, D]
+    x.shape, grad_y.shape = [N, D], [N, D]
+    var.shape, mean.shape = [N, 1], [N, 1]
+
+    # d_bias
+    if bias is not None:
+        d_bias = np.sum(grad_y, axis=0).reshape([1, D])
+    else:
+        d_bias = None
+    # d_scale
+    if scale is not None:
+        d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y, axis=0).reshape(
+            [1, D]
+        )
+    else:
+        d_scale = None
+    # dx
+    if scale is not None:
+        dx_end = scale * np.sqrt(1.0 / var) * grad_y
+        d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
+            [N, 1]
+        )  # the second part equals to zero.
+        d_mean = 1.0 / D * d_mean_0
+        d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape(
+            [N, 1]
+        ) * (1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
+    else:
+        dx_end = 1.0 * np.sqrt(1.0 / var) * grad_y
+        d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * 1.0, axis=1).reshape(
+            [N, 1]
+        )  # the second part equals to zero.
+ d_mean = 1.0 / D * d_mean_0 + d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * 1.0, axis=1).reshape( + [N, 1] + ) * (1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) + + grad_x = dx_end + d_mean + d_std + + grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape + var.shape, mean.shape = [N], [N] + + if scale is not None: + scale.shape = scale_shape + return grad_x, d_scale, d_bias + + +class TestLayerNormOp(unittest.TestCase): + def setUp(self): + self.init_dtype() + self.place = paddle.CustomPlace("metax_gpu", 0) + self.__class__.use_custom_device = True + + def init_dtype(self): + self.dtype = np.float32 + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + np.testing.assert_allclose( + np.array(tensor), np_array, rtol=1e-4, atol=atol, err_msg=msg + ) + + def check_forward_backward( + self, + shape, + begin_norm_axis, + has_scale=True, + has_bias=True, + y_grad_scale=1.0, + use_mkldnn=False, + ): + def test_with_place(place, shape, begin_norm_axis, use_mkldnn=use_mkldnn): + # attr + epsilon = 0.00001 + x_shape = shape + D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1) + scale_shape = [D] + + np.random.seed(123) + x = np.random.random_sample(x_shape).astype(self.dtype) + scale = ( + np.random.random_sample(scale_shape).astype(np.float32) + if has_scale + else None + ) + bias = ( + np.random.random_sample(scale_shape).astype(np.float32) + if has_bias + else None + ) + y_grad = (np.random.random_sample(x_shape) * y_grad_scale).astype( + self.dtype + ) + + # reference forward & backward + y, mean, variance = _reference_layer_norm_naive( + x, scale, bias, epsilon, begin_norm_axis + ) + x_grad, scale_grad, bias_grad = _reference_layer_norm_grad( + x, y_grad, scale, bias, mean, variance, begin_norm_axis + ) + mean.shape = x_shape[0:begin_norm_axis] + variance.shape = x_shape[0:begin_norm_axis] + + var_dict = locals() + var_dict["y@GRAD"] = y_grad + var_names = ["x", "mean", "variance", "y", "y@GRAD"] + if has_scale: + var_names += ["scale"] + if has_bias: + var_names += ["bias"] + ground_truth = {name: var_dict[name] for name in var_names} + + with OldIrGuard(): + program = base.Program() + old_program_guard = base.program_guard + with old_program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, dtype=self.dtype, shape=ground_truth[name].shape + ) + inputs = {"X": block.var("x")} + fetch_list = [ + "y", + "mean", + "variance", + "x@GRAD", + ] + if has_scale: + inputs["Scale"] = block.var("scale") + fetch_list += ["scale@GRAD"] + if has_bias: + inputs["Bias"] = block.var("bias") + fetch_list += ["bias@GRAD"] + layer_norm_op = block.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": block.var("y"), + "Mean": block.var("mean"), # share the same memory + "Variance": block.var("variance"), # share the same memory + }, + attrs={ + "epsilon": epsilon, + "begin_norm_axis": begin_norm_axis, + "use_mkldnn": use_mkldnn, + }, + ) + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + layer_norm_op.desc, set(), [] + ) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) 
+ + program._sync_with_cpp() + exe = base.Executor(place) + with OldIrGuard(): + out = exe.run( + program, + feed={ + name: var_dict[name] + for name in ["x", "scale", "bias", "y@GRAD"] + }, + fetch_list=fetch_list, + ) + + self.__assert_close(y, out[0], "y") + self.__assert_close(mean, out[1], "mean") + self.__assert_close(variance, out[2], "variance", 1e-3) + self.__assert_close(x_grad, out[3], "x_grad") + if has_scale: + self.__assert_close( + scale_grad.reshape(-1), + out[fetch_list.index("scale@GRAD")], + "scale_grad", + 1e-3, + ) + if has_bias: + self.__assert_close( + bias_grad.reshape(-1), + out[fetch_list.index("bias@GRAD")], + "bias_grad", + ) + + test_with_place(self.place, shape, begin_norm_axis) + + def test_check_forward_backward_with_scale_and_bias(self): + self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=False, has_bias=True + ) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=True, has_bias=False + ) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=False, has_bias=False + ) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3) + self.check_forward_backward( + shape=[92, 513, 129], begin_norm_axis=2, y_grad_scale=0.1 + ) + self.check_forward_backward(shape=[3, 34, 1134], begin_norm_axis=2) + self.check_forward_backward( + shape=[92, 513, 1134], begin_norm_axis=2, y_grad_scale=0.1 + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=True, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=True, + has_bias=False, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=False, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[512, 1024], begin_norm_axis=1, has_scale=True, has_bias=True + ) + + +class TestFP16ScaleBiasLayerNorm(unittest.TestCase): + def check_main(self, x_np, weight_np, bias_np, dtype): + paddle.disable_static() + + weight_np = weight_np.astype(dtype) + bias_np = bias_np.astype(dtype) + + x = paddle.to_tensor(x_np) + weight = paddle.to_tensor(weight_np) + bias = paddle.to_tensor(bias_np) + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False + y = F.layer_norm(x, x.shape[1:], weight, bias) + x_g, w_g, b_g = paddle.grad(y, [x, weight, bias]) + y_np = y.numpy().astype("float32") + x_g_np = x_g.numpy().astype("float32") + w_g_np = w_g.numpy().astype("float16") + b_g_np = b_g.numpy().astype("float32") + + paddle.enable_static() + return y_np, x_g_np, w_g_np, b_g_np + + def test_main(self): + paddle.set_device("metax_gpu") + x_np = np.random.random([10, 20]).astype("float16") + weight_np = np.random.random([20]).astype("float16") + bias_np = np.random.random([20]).astype("float16") + + y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main( + x_np, weight_np, bias_np, "float16" + ) + y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main( + x_np, weight_np, bias_np, "float32" + ) + + def assert_equal(x, y): + np.testing.assert_allclose(x, y) + + assert_equal(y_np_1, y_np_2) + assert_equal(x_g_np_1, x_g_np_2) + assert_equal(w_g_np_1, w_g_np_2) + assert_equal(b_g_np_1, b_g_np_2) + + +class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): + def test_main(self): + 
self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(False) + self.assertFalse(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(True) + self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py b/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py new file mode 100644 index 00000000000..7545e16d14d --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py @@ -0,0 +1,395 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +from tests.op_test import OpTest +import paddle + +paddle.enable_static() +SEED = 2022 + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size,)) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size,)) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if abs(scale - 1.0) > 1e-09: + Out = Out * scale + return Out + + +class TestBmmOp(OpTest): + """ + case 0 + """ + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def config(self): + self.x_shape = (10, 2, 5) + self.y_shape = (10, 5, 8) + + def init_kernel_type(self): + self.dtype = "float32" + + def setUp(self): + self.set_metax_gpu() + self.init_kernel_type() + self.config() + self.op_type = "bmm" + x = np.random.random(self.x_shape).astype(self.dtype) + y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y + result = reference_matmul(x, y) + result = result.astype(self.dtype) + self.inputs = { + "X": x, + "Y": y, + } + self.outputs = {"Out": result} + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestBmmOp1(TestBmmOp): + """ + case 1 + """ + + def config(self): + self.x_shape = (40, 10, 10) + self.y_shape = (40, 10, 10) + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestBmmOp2(TestBmmOp): + """ + case 2 + """ + + def config(self): + self.x_shape = (4, 10, 80) + self.y_shape = (4, 80, 1) + + def 
test_check_grad(self): + self.check_grad_with_place( + self.place, + ["X", "Y"], + "Out", + max_relative_error=1e-2, + ) + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +class TestMatMulOp(OpTest): + """ + basic case + """ + + def setUp(self): + self.set_metax_gpu() + self.op_type = "matmul_v2" + self.init_dtype() + self.init_alpha() + self.config() + + X = np.random.random(self.x_shape).astype(self.dtype) + Y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + X = -0.1 + 0.2 * X + Y = -0.1 + 0.2 * Y + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, self.alpha) + Out = Out.astype(self.dtype) + self.inputs = {"X": X, "Y": Y} + self.attrs = { + "trans_x": self.transpose_X, + "trans_y": self.transpose_Y, + "alpha": self.alpha, + } + self.outputs = {"Out": Out} + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def config(self): + self.x_shape = (100,) + self.y_shape = (100,) + self.transpose_X = False + self.transpose_Y = False + + def init_alpha(self): + self.alpha = 1.0 + + def init_dtype(self): + self.dtype = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestMatMulOp1(TestMatMulOp): + """ + case x_ndim == 1, y_ndim != 1 + """ + + def config(self): + self.x_shape = (100,) + self.y_shape = (1, 3, 2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp2(TestMatMulOp): + """ + case x_ndim != 1, y_ndim == 1 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (100,) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp3(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp4(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp5(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (100, 2) + self.y_shape = (100, 2) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp6(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 2, 25) + self.y_shape = (25, 4) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp7(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 4, 25) + self.y_shape = (4, 25) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp8(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 25, 4) + self.y_shape = (25, 4) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp9(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 10, 5) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp10(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 10, 5) + self.y_shape = (2, 10, 5) + self.transpose_X = True + self.transpose_Y = False + + +class 
TestMatMulOp11(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 5, 10) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp12(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = 100 + self.y_shape = (1, 2, 2, 100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp13(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = 100 + self.transpose_X = False + self.transpose_Y = False + + +# TODO(metax_gpu): alpha will be supported in next version +# --------------------test matmul alpha-------------------- +# def create_test_alpha_class(parent): +# class TestMatMulOpAlphaCase(parent): +# def init_alpha(self): +# self.alpha = 0.125 + +# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") +# TestMatMulOpAlphaCase.__name__ = cls_name +# globals()[cls_name] = TestMatMulOpAlphaCase + +# create_test_alpha_class(TestMatMulOp) +# create_test_alpha_class(TestMatMulOp1) +# create_test_alpha_class(TestMatMulOp2) +# create_test_alpha_class(TestMatMulOp3) +# create_test_alpha_class(TestMatMulOp4) +# create_test_alpha_class(TestMatMulOp5) +# create_test_alpha_class(TestMatMulOp6) +# create_test_alpha_class(TestMatMulOp9) +# create_test_alpha_class(TestMatMulOp10) +# create_test_alpha_class(TestMatMulOp11) +# create_test_alpha_class(TestMatMulOp12) +# create_test_alpha_class(TestMatMulOp13) + + +# --------------------test matmul fp16-------------------- +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ["X", "Y"], "Out", max_relative_error=max_relative_error + ) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulOp) +create_test_fp16_class(TestMatMulOp1) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py b/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py new file mode 100644 index 00000000000..c9bccd2abb3 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py @@ -0,0 +1,220 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from op_test import OpTest, convert_float_to_uint16
+
+import paddle
+from paddle import base
+from paddle.base import Program, program_guard
+
+
+def call_nonzero(x):
+    input = paddle.to_tensor(x)
+    return paddle.nonzero(x=input)
+
+
+class TestNonZeroAPI(unittest.TestCase):
+    def test_nonzero_api_as_tuple(self):
+        paddle.enable_static()
+        data = np.array([[1, 0], [0, 1]], dtype="float32")
+        with program_guard(Program(), Program()):
+            x = paddle.static.data(name="x", shape=[-1, 2], dtype="float32")
+            if not paddle.framework.use_pir_api():
+                x.desc.set_need_check_feed(False)
+            y = paddle.nonzero(x, as_tuple=True)
+            self.assertEqual(type(y), tuple)
+            self.assertEqual(len(y), 2)
+            z = paddle.concat(list(y), axis=0)
+            exe = base.Executor(base.CPUPlace())
+
+            (res,) = exe.run(feed={"x": data}, fetch_list=[z], return_numpy=False)
+            expect_out = np.array([0, 1, 0, 1])
+            np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05)
+
+        data = np.array([1, 1, 0], dtype="float32")
+        with program_guard(Program(), Program()):
+            x = paddle.static.data(name="x", shape=[-1], dtype="float32")
+            if not paddle.framework.use_pir_api():
+                x.desc.set_need_check_feed(False)
+            y = paddle.nonzero(x, as_tuple=True)
+            self.assertEqual(type(y), tuple)
+            self.assertEqual(len(y), 1)
+            z = paddle.concat(list(y), axis=0)
+            exe = base.Executor(base.CPUPlace())
+            (res,) = exe.run(feed={"x": data}, fetch_list=[z], return_numpy=False)
+            expect_out = np.array([0, 1])
+            np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05)
+
+        data = np.zeros([10, 3, 0], dtype="float32")
+        with program_guard(Program(), Program()):
+            x = paddle.static.data(name="x", shape=[10, 3, 0], dtype="float32")
+            if not paddle.framework.use_pir_api():
+                x.desc.set_need_check_feed(False)
+            y = paddle.nonzero(x, as_tuple=True)
+            self.assertEqual(type(y), tuple)
+            self.assertEqual(len(y), 3)
+            expect_out = np.zeros([0])
+            for item in y:
+                np.testing.assert_array_equal(expect_out, item)
+
+    def test_nonzero_api(self):
+        paddle.enable_static()
+        data = np.array([[1, 0], [0, 1]], dtype="float32")
+        with program_guard(Program(), Program()):
+            x = paddle.static.data(name="x", shape=[-1, 2], dtype="float32")
+            if not paddle.framework.use_pir_api():
+                x.desc.set_need_check_feed(False)
+            y = paddle.nonzero(x)
+            exe = base.Executor(base.CPUPlace())
+            (res,) = exe.run(feed={"x": data}, fetch_list=[y], return_numpy=False)
+            expect_out = np.array([[0, 0], [1, 1]])
+            np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05)
+
+        data = np.array([1, 1, 0], dtype="float32")
+        with program_guard(Program(), Program()):
+            x = paddle.static.data(name="x", shape=[-1], dtype="float32")
+            if not paddle.framework.use_pir_api():
+                x.desc.set_need_check_feed(False)
+            y = paddle.nonzero(x)
+            exe = base.Executor(base.CPUPlace())
+            (res,) = exe.run(feed={"x": data}, fetch_list=[y], return_numpy=False)
+            expect_out = np.array([[0], [1]])
+            np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05)
+
+    def test_dygraph_api(self):
+        data_x = np.array([[True, False], [False, True]])
+        with base.dygraph.guard():
+            x = paddle.to_tensor(data_x)
+            z = paddle.nonzero(x)
+            np_z = z.numpy()
+            expect_out = np.array([[0, 0], [1, 1]])
+            np.testing.assert_allclose(expect_out, np_z, rtol=1e-05)
+
+
+# Base case
+class TestNonzeroOp(OpTest):
+    def setUp(self):
+        """Test where_index op with random value"""
+        np.random.seed(2023)
+        self.op_type = "where_index"
+        self.python_api = 
call_nonzero + self.init_shape() + self.init_dtype() + + self.inputs = self.create_inputs() + self.outputs = self.return_outputs() + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def init_shape(self): + self.shape = [8, 8] + + def init_dtype(self): + self.dtype = np.float64 + + def create_inputs(self): + return {"Condition": np.random.randint(5, size=self.shape).astype(self.dtype)} + + def return_outputs(self): + return {"Out": np.transpose(np.nonzero(self.inputs["Condition"]))} + + +class TestNonzeroComplex64Op(TestNonzeroOp): + def init_shape(self): + self.shape = [1, 2, 3] + + def init_dtype(self): + self.dtype = np.complex64 + + +class TestNonzeroComplex128Op(TestNonzeroOp): + def init_shape(self): + self.shape = [1, 2, 3] + + def init_dtype(self): + self.dtype = np.complex128 + + +class TestNonzeroFP32Op(TestNonzeroOp): + def init_shape(self): + self.shape = [2, 10, 2] + + def init_dtype(self): + self.dtype = np.float32 + + +class TestNonzeroFP16Op(TestNonzeroOp): + def init_shape(self): + self.shape = [3, 4, 7] + + def init_dtype(self): + self.dtype = np.float16 + + +class TestNonzeroBF16(OpTest): + def setUp(self): + """Test where_index op with bfloat16 dtype""" + np.random.seed(2023) + self.op_type = "where_index" + self.python_api = call_nonzero + self.init_shape() + self.init_dtype() + + self.inputs = self.create_inputs() + self.outputs = self.return_outputs() + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def init_shape(self): + self.shape = [12, 9] + + def init_dtype(self): + self.dtype = np.uint16 + + def create_inputs(self): + return { + "Condition": convert_float_to_uint16( + np.random.randint(5, size=self.shape).astype(np.float32) + ) + } + + def return_outputs(self): + return {"Out": np.transpose(np.nonzero(self.inputs["Condition"]))} + + +class TestZeroSizeOp(TestNonzeroOp): + def init_shape(self): + self.shape = [0, 10] + + def init_dtype(self): + self.dtype = np.float64 + + +class TestZeroSizeOpCase2(TestNonzeroOp): + def init_shape(self): + self.shape = [0, 10] + + def init_dtype(self): + self.dtype = np.float64 + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py b/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py new file mode 100644 index 00000000000..c1bc46517b6 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py @@ -0,0 +1,215 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
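+
+# p-norm formula used by the reference below, reduced along `axis`:
+#     ||x||_p = (sum_i |x_i| ** p) ** (1 / p)
+# with the usual conventions porder=inf -> max|x_i|, porder=-inf -> min|x_i|,
+# and porder=0 counting the nonzero entries (not a true norm).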
+
+import unittest
+import numpy as np
+
+import paddle
+from tests.op_test import OpTest
+
+paddle.enable_static()
+
+
+def p_norm(x, axis, porder, keepdims=False, reduce_all=False):
+    r = []
+    if axis is None or reduce_all:
+        x = x.flatten()
+        if porder == np.inf:
+            r = np.amax(np.abs(x), keepdims=keepdims)
+        elif porder == -np.inf:
+            r = np.amin(np.abs(x), keepdims=keepdims)
+        else:
+            r = np.linalg.norm(x, ord=porder, keepdims=keepdims)
+    elif isinstance(axis, (list, tuple)) and len(axis) == 2:
+        if porder == np.inf:
+            axis = tuple(axis)
+            r = np.amax(np.abs(x), axis=axis, keepdims=keepdims)
+        elif porder == -np.inf:
+            axis = tuple(axis)
+            r = np.amin(np.abs(x), axis=axis, keepdims=keepdims)
+        elif porder == 0:
+            axis = tuple(axis)
+            r = x.astype(bool)
+            r = np.sum(r, axis, keepdims=keepdims)
+        elif porder == 1:
+            axis = tuple(axis)
+            r = np.sum(np.abs(x), axis, keepdims=keepdims)
+        else:
+            axis = tuple(axis)
+            xp = np.power(np.abs(x), porder)
+            s = np.sum(xp, axis=axis, keepdims=keepdims)
+            r = np.power(s, 1.0 / porder)
+    else:
+        if isinstance(axis, list):
+            axis = tuple(axis)
+        r = np.linalg.norm(x, ord=porder, axis=axis, keepdims=keepdims)
+    r = r.astype(x.dtype)
+
+    return r
+
+
+class TestPnormOp(OpTest):
+    def set_metax_gpu(self):
+        self.__class__.use_custom_device = True
+
+    def setUp(self):
+        self.set_metax_gpu()
+        self.op_type = "p_norm"
+        self.init_test_case()
+        x = (np.random.random(self.shape) + 0.5).astype(self.dtype)
+        norm = p_norm(x, self.axis, self.porder, self.keepdim)
+        self.inputs = {"X": x}
+        self.attrs = {
+            "epsilon": self.epsilon,
+            "axis": self.axis,
+            "keepdim": self.keepdim,
+            "porder": float(self.porder),
+        }
+        self.outputs = {"Out": norm}
+        self.gradient = self.calc_gradient()
+
+    def test_check_output(self):
+        if self.dtype == "float16":
+            self.check_output_with_place(paddle.CustomPlace("metax_gpu", 0), atol=5e-3)
+        else:
+            self.check_output_with_place(paddle.CustomPlace("metax_gpu", 0))
+
+    def test_check_grad(self):
+        self.check_grad_with_place(
+            paddle.CustomPlace("metax_gpu", 0),
+            ["X"],
+            "Out",
+            user_defined_grads=self.gradient,
+        )
+
+    def init_test_case(self):
+        self.shape = [2, 3, 4, 5]
+        self.axis = 1
+        self.epsilon = 1e-12
+        self.porder = 2.0
+        self.keepdim = False
+        self.init_dtype()
+
+    def init_dtype(self):
+        self.dtype = "float32"
+
+    def calc_gradient(self):
+        self.attrs = {
+            "epsilon": self.epsilon,
+            "axis": self.axis,
+            "keepdim": self.keepdim,
+            "porder": float(self.porder),
+        }
+        x = self.inputs["X"]
+        porder = self.attrs["porder"]
+        axis = self.attrs["axis"]
+        if porder == 0:
+            grad = np.zeros(x.shape).astype(x.dtype)
+        elif porder in [float("inf"), float("-inf")]:
+            norm = p_norm(x, axis=axis, porder=porder, keepdims=True)
+            x_abs = np.abs(x)
+            grad = np.sign(x)
+            grad[x_abs != norm] = 0.0
+        else:
+            norm = p_norm(x, axis=axis, porder=porder, keepdims=True)
+            grad = (
+                np.power(norm, 1 - porder)
+                * np.power(np.abs(x), porder - 1)
+                * np.sign(x)
+            )
+
+        numel = 1
+        for s in x.shape:
+            numel *= s
+        numel /= x.shape[axis]
+        return [grad.astype(x.dtype) * 1 / numel]
+
+
+class TestPnormOp2(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [3, 20, 3]
+        self.axis = 2
+        self.epsilon = 1e-12
+        self.porder = 2.0
+        self.keepdim = True
+        self.init_dtype()
+
+
+# class TestPnormOp3(TestPnormOp):
+#     def init_test_case(self):
+#         self.shape = [3, 20, 3]
+#         self.axis = 2
+#         self.epsilon = 1e-12
+#         self.porder = np.inf
+#         self.keepdim = True
+#         self.init_dtype()
+
+
+# class TestPnormOp4(TestPnormOp3):
+#     def init_test_case(self):
+# 
self.shape = [3, 20, 3] +# self.axis = 2 +# self.epsilon = 1e-12 +# self.porder = -np.inf +# self.keepdim = True +# self.init_dtype() + + +class TestPnormOp5(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = 0 + self.keepdim = True + self.init_dtype() + + +# class TestPnormOp6(TestPnormOp): +# def init_test_case(self): +# self.shape = [2, 3, 4, 5] +# self.axis = 1 +# self.epsilon = 1e-12 +# self.porder = 0.5 +# self.keepdim = False +# self.init_dtype() + + +class TestPnormOpfp16(TestPnormOp): + def init_dtype(self): + self.dtype = "float16" + + +class TestPnormOp2fp16(TestPnormOp2): + def init_dtype(self): + self.dtype = "float16" + + +# class TestPnormOp3fp16(TestPnormOp3): +# def init_dtype(self): +# self.dtype = "float16" + + +# class TestPnormOp4fp16(TestPnormOp4): +# def init_dtype(self): +# self.dtype = "float16" + + +class TestPnormOp5fp16(TestPnormOp5): + def init_dtype(self): + self.dtype = "float16" + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py b/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py new file mode 100644 index 00000000000..c67e807397c --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py @@ -0,0 +1,125 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest + +# import sys + +# sys.path.append("..") + +import numpy as np + +import paddle +from tests.op_test import OpTest + +paddle.enable_static() + + +# Correct: General. 
+class TestSqueezeOp(OpTest):
+    def setUp(self):
+        self.op_type = "squeeze2"
+        self.init_test_case()
+        self.set_metax_gpu()
+        self.inputs = {"X": np.random.random(self.ori_shape).astype("float64")}
+        self.init_attrs()
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+        }
+
+    def set_metax_gpu(self):
+        self.__class__.use_custom_device = True
+        self.place = paddle.CustomPlace("metax_gpu", 0)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ["X"], "Out")
+
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 40)
+        self.axes = (0, 2)
+        self.new_shape = (3, 40)
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes}
+
+
+# class TestSqueezeBF16Op(OpTest):
+#     def setUp(self):
+#         self.op_type = "squeeze2"
+#         self.dtype = np.uint16
+#         self.init_test_case()
+#         self.set_metax_gpu()
+#         x = np.random.random(self.ori_shape).astype("float32")
+#         out = x.reshape(self.new_shape)
+#         self.inputs = {"X": convert_float_to_uint16(x)}
+#         self.init_attrs()
+#         self.outputs = {"Out": convert_float_to_uint16(out)}
+
+#     def set_metax_gpu(self):
+#         self.__class__.use_custom_device = True
+#         self.place = paddle.CustomPlace("metax_gpu", 0)
+
+#     def test_check_output(self):
+#         self.check_output()
+
+#     def test_check_grad(self):
+#         self.check_grad(["X"], "Out")
+
+#     def init_test_case(self):
+#         self.ori_shape = (1, 3, 1, 40)
+#         self.axes = (0, 2)
+#         self.new_shape = (3, 40)
+
+#     def init_attrs(self):
+#         self.attrs = {"axes": self.axes}
+
+
+# Correct: There is a negative axis.
+class TestSqueezeOp1(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 40)
+        self.axes = (0, -2)
+        self.new_shape = (3, 40)
+
+
+# Correct: No axes input.
+class TestSqueezeOp2(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 20, 1, 5)
+        self.axes = ()
+        self.new_shape = (20, 5)
+
+
+# Correct: Just part of the axes are squeezed.
+class TestSqueezeOp3(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (6, 1, 5, 1, 4, 1)
+        self.axes = (1, -1)
+        self.new_shape = (6, 5, 1, 4)
+
+
+# Correct: The dimension at an axis whose size is not 1 remains unchanged.
+class TestSqueezeOp4(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (6, 1, 5, 1, 4, 1)
+        self.axes = (1, 2)
+        self.new_shape = (6, 5, 1, 4, 1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py b/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py
new file mode 100644
index 00000000000..40e46e70a21
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py
@@ -0,0 +1,295 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
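+
+# swiglu(x, y) = silu(x) * y = x * sigmoid(x) * y. The reference below
+# builds exactly this from separate paddle ops (F.silu(x) * y) so the fused
+# kernel can be checked against it. A minimal sketch of the same math, for
+# illustration only (the helper name is ours, not part of the API under test):
+#
+#     def swiglu_ref(x, y):
+#         return x / (1.0 + np.exp(-x)) * y  # silu(x) = x / (1 + exp(-x))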
+ +import unittest + +import numpy as np +from op_test import OpTest + +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import _C_ops +from paddle.base import core +from paddle.distributed.auto_parallel.static.dist_attribute import ( + DistTensorSpec, + TensorDistAttr, +) +from paddle.incubate.nn.functional import swiglu as fused_swiglu_impl + + +def swiglu(x, y, out_grad): + if isinstance(x, np.ndarray): + x = paddle.to_tensor(x) + y = paddle.to_tensor(y) + out_grad = paddle.to_tensor(out_grad) + + origin_x = x.detach().clone() + origin_x.stop_gradient = False + x = origin_x + + origin_y = y.detach().clone() + origin_y.stop_gradient = False + y = origin_y + + dtype = x.dtype + need_convert = False + assert dtype == y.dtype + output_dtype = dtype + if paddle.is_compiled_with_cuda(): + if dtype in [paddle.float16, paddle.bfloat16]: + output_dtype = paddle.float32 + x = x.astype(output_dtype) + y = y.astype(output_dtype) + need_convert = True + + out = F.silu(x) * y + if need_convert: + out = out.astype(dtype) + out.backward(out_grad) + ret = [ + out.astype(output_dtype), + origin_x.grad.astype(output_dtype), + origin_y.grad.astype(output_dtype), + ] + return ret + + +def fused_swiglu(x, y, out_grad): + x = x.detach().clone() + x.stop_gradient = False + if y is not None: + y = y.detach().clone() + y.stop_gradient = False + out = fused_swiglu_impl(x, y) + out.backward(out_grad) + + output_dtype = x.dtype + if paddle.is_compiled_with_cuda(): + if x.dtype in [paddle.float16, paddle.bfloat16]: + output_dtype = paddle.float32 + ret = [ + out.astype(output_dtype), + ] + if y is not None: + x_grad, y_grad = x.grad, y.grad + else: + x_grad, y_grad = paddle.split(x.grad, 2, axis=-1) + + ret.append(x_grad.astype(output_dtype)) + ret.append(y_grad.astype(output_dtype)) + return ret + + +tol_map = { + paddle.float64: [1e-8, 1e-8], + paddle.float32: [1e-6, 1e-6], + paddle.float16: [1e-3, 1e-3], + paddle.bfloat16: [1e-3, 1e-3], +} + + +class TestSwiGLUDygraph(unittest.TestCase): + def check_dygraph_impl(self, device, shape, dtype): + x = paddle.randn(shape, dtype=dtype) + y = paddle.randn(shape, dtype=dtype) + out_grad = paddle.randn(shape, dtype=dtype) + + ret1 = swiglu(x, y, out_grad) + ret2 = fused_swiglu(x, y, out_grad) + ret3 = fused_swiglu(paddle.concat([x, y], axis=-1), None, out_grad) + + atol, rtol = tol_map[dtype] + err_msg = f"Failed when device = {device}, dtype = {dtype}, shape = {shape}" + for t1, t2, t3 in zip(ret1, ret2, ret3): + t1, t2, t3 = t1.numpy(), t2.numpy(), t3.numpy() + np.testing.assert_allclose(t1, t2, atol=atol, rtol=rtol, err_msg=err_msg) + np.testing.assert_equal(t2, t3, err_msg=err_msg) + + def check_dygraph(self, shape): + metas = [("cpu", paddle.float32), ("cpu", paddle.float64)] + if paddle.is_compiled_with_cuda(): + metas.append(("gpu", paddle.float32)) + metas.append(("gpu", paddle.float64)) + metas.append(("gpu", paddle.float16)) + prop = paddle.device.cuda.get_device_properties() + if prop.major >= 8: + metas.append(("gpu", paddle.bfloat16)) + + for device, dtype in metas: + origin_device = paddle.get_device() + paddle.set_device(device) + for with_split in [True]: + self.check_dygraph_impl(device, shape, dtype) + paddle.set_device(origin_device) + + def check_static_graph(self, shape, dtype="float32"): + x = paddle.static.data(name="x", shape=shape, dtype=dtype) + y = paddle.static.data(name="y", shape=shape, dtype=dtype) + concated_x = paddle.static.data( + name="concated_x", + shape=[*shape[:-1], shape[-1] * 2], + 
dtype=dtype, + ) + out1 = fused_swiglu_impl(x, y) + out2 = fused_swiglu_impl(concated_x) + + concated_x_np = np.random.random(concated_x.shape).astype(dtype) + x_np, y_np = np.split(concated_x_np, 2, axis=-1) + + exe = paddle.static.Executor() + t1, t2 = exe.run( + feed={"x": x_np, "y": y_np, "concated_x": concated_x_np}, + fetch_list=[out1, out2], + ) + np.testing.assert_equal(t1, t2) + + def check_main(self, shape): + self.check_dygraph(shape) + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + self.check_static_graph(shape) + paddle.disable_static() + + def test_main(self): + self.check_main([8, 100]) + self.check_main([4, 101]) + + +class TestSwigluOp(OpTest): + def config(self): + self.x_shape = (8, 128) + self.check_auto_parallel = True + + def setUp(self): + self.config() + self.op_type = "swiglu" + self.prim_op_type = "comp" + self.python_api = fused_swiglu_impl + self.public_python_api = fused_swiglu_impl + x = np.random.uniform(-1, 1, self.x_shape).astype("float64") + y = np.random.uniform(-1, 1, self.x_shape).astype("float64") + out_grad = np.random.uniform(-1, 1, self.x_shape).astype("float64") + res = swiglu(x, y, out_grad) + self.inputs = {"x": x, "y": y} + self.outputs = {"out": res[0].numpy()} + self.placements = { + "x": [dist.Shard(1)], + "y": [dist.Shard(1)], + "out": [dist.Shard(1)], + } + + def test_check_output(self): + self.check_output(check_prim_pir=True) + + def test_check_grad(self): + self.check_grad( + ["x", "y"], + "out", + check_auto_parallel=self.check_auto_parallel, + check_dygraph=1, + check_prim_pir=True, + ) + + +class TestSwigluOp2(TestSwigluOp): + def setUp(self): + self.config() + self.op_type = "swiglu" + self.prim_op_type = "comp" + self.python_api = fused_swiglu_impl + self.public_python_api = fused_swiglu_impl + x = np.random.uniform(-1, 1, self.x_shape).astype("float64") + tmp_inputs = np.split(x, 2, axis=-1) + x = tmp_inputs[0] + y = tmp_inputs[1] + out_grad = np.random.uniform(-1, 1, x.shape).astype("float64") + res = swiglu(x, y, out_grad) + self.inputs = {"x": x, "y": y} + self.outputs = {"out": res[0].numpy()} + self.placements = { + "x": [dist.Shard(1)], + "y": [dist.Shard(1)], + "out": [dist.Shard(1)], + } + + +@unittest.skipIf( + not paddle.base.core.is_compiled_with_dist(), + "The spmd rule is should be tested with distributed=ON", +) +class TestSwigluSpmd(unittest.TestCase): + def setUp(self): + self.kernel = "swiglu" + self.rule = paddle.base.core.get_phi_spmd_rule(self.kernel) + x_shape = [64, 32] + process_mesh = dist.ProcessMesh(mesh=[0, 1, 2, 3]) + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [-1, 0] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + self.y_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + self.out_dist_tensor_spec = DistTensorSpec(self.x_dist_tensor_spec) + + def test_input_x_y(self): + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, self.y_dist_tensor_spec + ) + inferred_input_dist_attrs = result_dist_attrs[0] + inferred_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(inferred_input_dist_attrs), 2) + self.assertEqual(len(inferred_output_dist_attrs), 1) + self.assertEqual(inferred_output_dist_attrs[0].dims_mapping, [-1, 0]) + + def test_input_x_unshard_last_dim(self): + x_shape = [64, 32] + process_mesh = dist.ProcessMesh(mesh=[0, 1, 2, 3]) + 
x_tensor_dist_attr = TensorDistAttr()
+        x_tensor_dist_attr.dims_mapping = [0, -1]
+        x_tensor_dist_attr.process_mesh = process_mesh
+        self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr)
+
+        result_dist_attrs = self.rule.infer_forward(
+            self.x_dist_tensor_spec, DistTensorSpec()
+        )
+        inferred_input_dist_attrs = result_dist_attrs[0]
+        inferred_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(len(result_dist_attrs), 2)
+        self.assertEqual(len(inferred_input_dist_attrs), 2)
+        self.assertEqual(len(inferred_output_dist_attrs), 1)
+        self.assertEqual(inferred_output_dist_attrs[0].dims_mapping, [0, -1])
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(), "0-size matmul only supported with cuda")
+class TestSwiglu0SizeDygraph(unittest.TestCase):
+    def test_swiglu(self):
+        x = paddle.ones([0, 128], dtype="float32")
+        y = paddle.ones([0, 128], dtype="float32")
+        x.stop_gradient = False
+        y.stop_gradient = False
+        out = fused_swiglu_impl(x, y)
+
+        dz = paddle.ones([0, 128], dtype="float32")
+
+        out = _C_ops.swiglu_grad(x, y, dz)
+
+        self.assertEqual(out[0].shape, x.shape)
+        self.assertEqual(out[1].shape, y.shape)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py b/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py
new file mode 100644
index 00000000000..4369972255d
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py
@@ -0,0 +1,162 @@
+# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
+# # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
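+
+# Top-p (nucleus) sampling keeps the smallest set of highest-probability
+# tokens whose cumulative probability exceeds p and samples from that set.
+# Worked example (illustrative): for sorted probs [0.5, 0.3, 0.15, 0.05]
+# and p = 0.7 the cumulative sums are [0.5, 0.8, 0.95, 1.0]; with the
+# keep-first-token shift used in TopPProcess below, only {0.5, 0.3}
+# survive and the remaining tokens are masked to probability 0.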
+ +import unittest + +import numpy as np + +import paddle + + +def TopPProcess(probs, top_p): + sorted_probs = paddle.sort(probs, descending=True) + sorted_indices = paddle.argsort(probs, descending=True) + cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) + + # Remove tokens with cumulative probs above the top_p, But keep at + # least min_tokens_to_keep tokens + sorted_indices_to_remove = cumulative_probs > top_p + + # Keep the first token + sorted_indices_to_remove = paddle.cast(sorted_indices_to_remove, dtype="int64") + + sorted_indices_to_remove = paddle.static.setitem( + sorted_indices_to_remove, + (slice(None), slice(1, None)), + sorted_indices_to_remove[:, :-1].clone(), + ) + sorted_indices_to_remove = paddle.static.setitem( + sorted_indices_to_remove, (slice(None), 0), 0 + ) + + # Scatter sorted tensors to original indexing + sorted_indices = ( + sorted_indices + paddle.arange(probs.shape[0]).unsqueeze(-1) * probs.shape[-1] + ) + condition = paddle.scatter( + sorted_indices_to_remove.flatten(), + sorted_indices.flatten(), + sorted_indices_to_remove.flatten(), + ) + condition = paddle.cast(condition, "bool").reshape(probs.shape) + probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) + next_tokens = paddle.multinomial(probs) + next_scores = paddle.index_sample(probs, next_tokens) + return next_scores, next_tokens + + +class TestTopPAPI(unittest.TestCase): + def setUp(self): + self.topp = 0.0 + self.seed = 6688 + self.batch_size = 3 + self.vocab_size = 10000 + self.dtype = "float32" + self.input_data = np.random.rand(self.batch_size, self.vocab_size) + + def run_dygraph(self, place): + with paddle.base.dygraph.guard(place): + input_tensor = paddle.to_tensor(self.input_data, self.dtype) + topp_tensor = paddle.to_tensor( + [ + self.topp, + ] + * self.batch_size, + self.dtype, + ).reshape((-1, 1)) + + # test case for basic test case 1 + paddle_result = paddle.tensor.top_p_sampling( + input_tensor, topp_tensor, seed=self.seed + ) + ref_res = TopPProcess(input_tensor, self.topp) + + np.testing.assert_allclose( + paddle_result[0].numpy(), ref_res[0].numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + paddle_result[1].numpy().flatten(), + ref_res[1].numpy().flatten(), + rtol=0, + ) + + # test case for basic test case 1 + paddle_result = paddle.tensor.top_p_sampling( + input_tensor, + topp_tensor, + seed=-1, + k=5, + mode="non-truncated", + return_top=True, + ) + ref_res = TopPProcess(input_tensor, self.topp) + + np.testing.assert_allclose( + paddle_result[0].numpy(), ref_res[0].numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + paddle_result[1].numpy().flatten(), + ref_res[1].numpy().flatten(), + rtol=0, + ) + + def run_static(self, place): + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + input_tensor = paddle.static.data( + name="x", shape=[6, 1030], dtype=self.dtype + ) + topp_tensor = paddle.static.data( + name="topp", shape=[6, 1], dtype=self.dtype + ) + result = paddle.tensor.top_p_sampling( + input_tensor, topp_tensor, seed=self.seed + ) + ref_res = TopPProcess(input_tensor, self.topp) + exe = paddle.static.Executor(place) + input_data = np.random.rand(6, 1030).astype(self.dtype) + paddle_result = exe.run( + feed={ + "x": input_data, + "topp": np.array( + [ + self.topp, + ] + * 6 + ).astype(self.dtype), + }, + fetch_list=[ + result[0], + result[1], + ref_res[0], + ref_res[1], + ], + ) + np.testing.assert_allclose(paddle_result[0], paddle_result[2], rtol=1e-05) + 
np.testing.assert_allclose(paddle_result[1], paddle_result[3], rtol=1e-05) + + def test_dygraph(self): + place = paddle.CustomPlace("metax_gpu", 0) + self.run_dygraph(place) + + def test_static(self): + place = paddle.CustomPlace("metax_gpu", 0) + self.run_static(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py b/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py new file mode 100644 index 00000000000..ff22c2c9ac9 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest + +from tests.op_test import OpTest +import paddle + +paddle.enable_static() + + +# Correct: General. +class TestUnsqueezeOp(OpTest): + def setUp(self): + self.set_metax_gpu() + self.op_type = "unsqueeze2" + self.dtype = "float32" + self.init_test_case() + self.inputs = {"X": np.random.random(self.ori_shape).astype(self.dtype)} + self.init_attrs() + self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.ori_shape = (3, 40) + self.axes = (1, 2) + self.new_shape = (3, 1, 1, 40) + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + +# Correct: Single input index. +class TestUnsqueezeOp1(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (-1,) + self.new_shape = (20, 5, 1) + + +# Correct: Mixed input axis. +class TestUnsqueezeOp2(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -1) + self.new_shape = (1, 20, 5, 1) + + +# Correct: There is duplicated axis. +class TestUnsqueezeOp3(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 10, 2, 1, 1, 5) + + +# Correct: Reversed axes. 
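+# (With ori_shape (10, 2, 5) and axes (3, 1, 1), each axis applies in order
+# to the progressively unsqueezed shape: inserting at 3 gives (10, 2, 5, 1),
+# then inserting at 1 twice gives the new_shape (10, 1, 1, 2, 5, 1) below.)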
+class TestUnsqueezeOp4(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (10, 2, 5)
+        self.axes = (3, 1, 1)
+        self.new_shape = (10, 1, 1, 2, 5, 1)
+
+
+# test float16
+class TestUnsqueezeOp5(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.dtype = "float16"
+        self.ori_shape = (10, 2, 5)
+        self.axes = (3, 1, 1)
+        self.new_shape = (10, 1, 1, 2, 5, 1)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 528ec55971cd8e115b3d0a7e2103bd4ebf7493a5 Mon Sep 17 00:00:00 2001
From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com>
Date: Tue, 16 Sep 2025 11:39:34 +0800
Subject: [PATCH 11/58] [Metax] update metax CI CMakeLists (#16)

* [Metax] update metax CI

* [Metax] update metax CI CMakeLists
---
 backends/metax_gpu/tests/CMakeLists.txt | 44 +++++++++++++++----------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt
index 7e549ef4eaa..37475773026 100755
--- a/backends/metax_gpu/tests/CMakeLists.txt
+++ b/backends/metax_gpu/tests/CMakeLists.txt
@@ -87,24 +87,32 @@ list(
 list(
   REMOVE_ITEM
   PYTHON_TEST_SCRIPTS
-  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py)
+  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py # precision issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py # affected by test_sum_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py # precision issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py # core.cudnnversion
+  # adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py # core.cudnnversion adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py # core.cudnnversion adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py # self._get_places()
+  # interface adaptation issue in op_test.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py # device == "gpu" adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py # same failure as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py # same failure as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py # core.cudnnversion adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py # same failure as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py # same failure as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py # paddle.device.cuda.get_device_properties
+  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py # needs check_grad with fp64
+  # precision
+  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # self._get_places()
+  # interface adaptation issue in op_test.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py # CUDAPinnedPlace issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py # paddle.device.cuda.get_device_properties
+  ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py # CUDAPinnedPlace issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py # paddle.device.cuda.get_device_properties
+)

 list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS)
 foreach(test_script ${PYTHON_TEST_SCRIPTS})

From 5b31405c13c32af5dbc826f7e8fec58e64a74322 Mon Sep 17 00:00:00 2001
From: duqimeng <77875733+duqimeng@users.noreply.github.com>
Date: Tue, 16 Sep 2025 15:02:29 +0800
Subject: [PATCH 12/58] [Metax] add github action (#18)

* [Metax] add github action

---------

Co-authored-by: Mingkun.Zhang <2496808993@qq.com>
Co-authored-by: metax666
Co-authored-by: jiaxinWang-metax <189149612@qq.com>
Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com>
Co-authored-by: chezhang <1376507468@qq.com>
Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com>
Co-authored-by: ZhouDuan <1184319564@qq.com>
---
 .github/workflows/metax_work.yaml | 52 +++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 .github/workflows/metax_work.yaml

diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml
new file mode 100644
index 00000000000..0d3d2637cdd
--- /dev/null
+++ b/.github/workflows/metax_work.yaml
@@ -0,0 +1,52 @@
+name: paddle metax gpu test
+
+on:
+  workflow_dispatch:
+  pull_request:
+    types: [opened, synchronize]
+    branches: [develop, release/**]
+    paths:
+      - "**"
+      - "!backends/**"
+      - "backends/metax_gpu/**"
+
+permissions: read-all
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  metax-gpu-test:
+    runs-on: paddle-metax-runner-set
+    steps:
+      - name: Checkout repository
+        run: |
+          git config --global user.name "GitHub Actions"
+          git config --global user.email "actions@github.com"
+
+          if [ "${{ github.event_name }}" == "pull_request" ]; then
+            BRANCH_NAME=${{ github.head_ref }}
+          else
+            BRANCH_NAME=${{ github.ref_name }}
+          fi
+
+          git clone \
+            --reference-if-able /home/runner/PaddleCustomDevice \
+            --depth=1 \
+            --shallow-submodules \
+            --jobs=8 \
+            --branch $BRANCH_NAME \
+            --recurse-submodules \
+            https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git .
+ + + - name: compile + run: | + cd backends/metax_gpu + bash build.sh + + - name: run test + run: | + cd backends/metax_gpu/tests + bash run_test.sh From b93c971b17729f09733faf5400d7ba44f1e5f3f2 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:15:34 +0800 Subject: [PATCH 13/58] [metax] chang build (#19) * [metax]chaneg build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index dd0ab3aab90..c288ea22312 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -20,7 +20,7 @@ set -e pip uninstall paddlepaddle -y -export http_proxy=http://10.2.192.21:1080 https_proxy=http://10.2.192.21:1080 +# export http_proxy=http://10.2.192.21:1080 https_proxy=http://10.2.192.21:1080 pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ @@ -50,7 +50,7 @@ fi echo "make_maca" cd build cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON -make_maca -j8 +make_maca -j60 echo "install whl" pip install dist/paddle_metax_gpu*.whl --force-reinstall From 6dbbe848d672a27bbbdded8e399ff5b1229c6647 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:04:55 +0800 Subject: [PATCH 14/58] change_build (#20) * [metax]chaneg build --------- --- backends/metax_gpu/build.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index c288ea22312..e52cddc6476 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -20,15 +20,18 @@ set -e pip uninstall paddlepaddle -y -# export http_proxy=http://10.2.192.21:1080 https_proxy=http://10.2.192.21:1080 +# init paddle +git submodule sync --recursive && git submodule update --init --recursive + + +export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ # exit 1 -# init paddle -git submodule sync --recursive && git submodule update --init --recursive +unset http_proxy https_proxy # apply patch bash change_patch.sh From ef1b28e5d17ceac419de30f8ba129f16444bd39d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:18:54 +0800 Subject: [PATCH 15/58] change_build (#21) --- backends/metax_gpu/build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index e52cddc6476..a40cac19e19 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -25,6 +25,7 @@ git submodule sync --recursive && git submodule update --init --recursive export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 +export pip install 
safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ From 3737e488da962ae43cde4d51e495454a2818eb01 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:24:15 +0800 Subject: [PATCH 16/58] change_build (#22) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index a40cac19e19..e3c4304e5f8 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -30,7 
+30,6 @@ pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/ # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ -# exit 1 unset http_proxy https_proxy From 16f35844e7218d0eb67aaffe6379c2a8820241e7 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Tue, 16 Sep 2025 16:52:30 +0800 Subject: [PATCH 17/58] =?UTF-8?q?=E3=80=90metax=E3=80=91modify=20cmake=20f?= =?UTF-8?q?or=20warpctc=20and=20warprnnt=20(#17)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel --- backends/metax_gpu/CMakeLists.txt | 4 +- backends/metax_gpu/cmake/warpctc.cmake | 7 +- backends/metax_gpu/cmake/warprnnt.cmake | 8 ++- .../fused_conv2d_add_act_kernel_register.cu | 2 +- .../conv_grad_kernel_register.cu | 42 ++++++++++-- .../kernels/gpudnn/conv_kernel_register.cu | 2 +- .../kernels/gpudnn/conv_transpose_kernel.cu | 2 +- backends/metax_gpu/kernels/impl/warpctc.h | 64 ------------------- .../kernels/impl/warpctc_grad_kernel_impl.h | 2 +- .../kernels/impl/warpctc_kernel_impl.h | 16 ++--- backends/metax_gpu/kernels/impl/warprnnt.h | 63 ------------------ .../kernels/impl/warprnnt_kernel_impl.h | 14 ++-- backends/metax_gpu/kernels/metax_context.cc | 20 +++++- backends/metax_gpu/kernels/metax_context.h | 1 + 14 files changed, 88 insertions(+), 159 deletions(-) rename backends/metax_gpu/kernels/{cuda_kernels => gpudnn}/conv_grad_kernel_register.cu (98%) delete mode 100644 backends/metax_gpu/kernels/impl/warpctc.h delete mode 100644 backends/metax_gpu/kernels/impl/warprnnt.h diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index cca23ab42f5..787aae13e40 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -736,7 +736,7 @@ add_library( target_include_directories( ${TARGET_NAME} PRIVATE ${PADDLE_SOURCE_DIR} ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/kernels - ${CUDA_INCLUDE_DIRS} ${PADDLE_SOURCE_DIR}/third_party/pybind/include + ${CUDA_INCLUDE_DIRS} ${WARPCTC_INCLUDE_DIR} ${WARPRNNT_INCLUDE_DIR} ${PADDLE_SOURCE_DIR}/third_party/pybind/include ${PADDLE_SOURCE_DIR}/paddle/phi/api/include/compat) target_link_libraries( @@ -749,6 +749,8 @@ target_link_libraries( protobuf external_error_proto dgc + ${WARPCTC_LIBRARIES} + ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index 71c892a6cfa..9edc92f0a94 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -145,5 +145,8 @@ get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) include_directories(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its # headers. 
-add_library(warpctc INTERFACE) -add_dependencies(warpctc extern_warpctc) +add_library(warpctc SHARED IMPORTED GLOBAL) +set_target_properties(warpctc PROPERTIES + IMPORTED_LOCATION ${WARPCTC_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPCTC_INCLUDE_DIR} +) \ No newline at end of file diff --git a/backends/metax_gpu/cmake/warprnnt.cmake b/backends/metax_gpu/cmake/warprnnt.cmake index 54a7ad6be86..527f2e55a1b 100644 --- a/backends/metax_gpu/cmake/warprnnt.cmake +++ b/backends/metax_gpu/cmake/warprnnt.cmake @@ -137,6 +137,8 @@ get_filename_component(WARPRNNT_LIBRARY_PATH ${WARPRNNT_LIBRARIES} DIRECTORY) include_directories(${WARPRNNT_INCLUDE_DIR}) # For warprnnt code to include its # headers. -add_library(warprnnt INTERFACE) -# set_property(TARGET warprnnt PROPERTY IMPORTED_LOCATION ${WARPRNNT_LIBRARIES}) -add_dependencies(warprnnt extern_warprnnt) +add_library(warprnnt SHARED IMPORTED GLOBAL) +set_target_properties(warprnnt PROPERTIES + IMPORTED_LOCATION ${WARPRNNT_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPRNNT_INCLUDE_DIR} +) \ No newline at end of file diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu index ee4f105cbc5..48809ceefa4 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu @@ -308,7 +308,7 @@ class CudnnConvDescManager { int groups, cudnnDataType_t dtype) { auto* desc = new phi::backends::gpu::ConvolutionDescriptor(); - desc->set(dtype, paddings, strides, dilations, true, groups); + desc->set(dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); return desc; } diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu similarity index 98% rename from backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu rename to backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu index 885137675b4..e4acb2f95b6 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu @@ -161,7 +161,12 @@ void ConvCudnnGradKernelImplV7( args1.idesc.set(*transformed_input_grad, layout_tensor); args1.wdesc.set(*transformed_filter_channel, layout_tensor, iwo_groups); args1.odesc.set(*transformed_output_grad_channel, layout_tensor); - args1.cdesc.set(dtype, padding_common, strides, dilations, true, c_groups); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; @@ -184,7 +189,12 @@ void ConvCudnnGradKernelImplV7( args2.wdesc.set( *transformed_filter_grad_channel, layout_tensor, iwo_groups); args2.odesc.set(*transformed_output_grad_channel, layout_tensor); - args2.cdesc.set(dtype, padding_common, strides, dilations, true, c_groups); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); @@ -1073,7 +1083,12 @@ void ConvCudnnGradGradKernel( args1.idesc.set(transformed_ddX, iwo_group); args1.wdesc.set(*W, layout, iwo_group); args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + 
args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; @@ -1092,7 +1107,12 @@ void ConvCudnnGradGradKernel( args2.idesc.set(transformed_X, iwo_group); args2.wdesc.set(*ddW, layout, iwo_group); args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; @@ -1114,7 +1134,12 @@ void ConvCudnnGradGradKernel( args3.idesc.set(transformed_ddX, iwo_group); args3.wdesc.set(*dW, layout, iwo_group); args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; @@ -1136,7 +1161,12 @@ void ConvCudnnGradGradKernel( args4.idesc.set(transformed_dX, iwo_group); args4.wdesc.set(*ddW, layout, iwo_group); args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search4 = SearchAlgorithm; diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index bdff5fa9f93..bf129fed05c 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -81,7 +81,7 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, args.cdesc.set( dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups); #else - args.cdesc.set(dtype, padding_common, strides, dilations, true); + args.cdesc.set(dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu index aa1cc80d06d..928201c705f 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -93,7 +93,7 @@ void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, args.idesc.set(*transformed_out, iwo_groups); args.wdesc.set(*filter, layout_tensor, iwo_groups); args.odesc.set(*transformed_x, iwo_groups); - args.cdesc.set(dtype, padding_common, strides, dilations_, false, c_groups); + args.cdesc.set(dtype, padding_common, strides, dilations_, phi::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP SearchResult bwd_result; diff --git a/backends/metax_gpu/kernels/impl/warpctc.h b/backends/metax_gpu/kernels/impl/warpctc.h deleted file mode 100644 index ba5da472ade..00000000000 --- a/backends/metax_gpu/kernels/impl/warpctc.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/common/port.h" -#include "third_party/warpctc/include/ctc.h" - -namespace phi { -namespace dynload { - -extern std::once_flag warpctc_dso_flag; -extern void* warpctc_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load warpctc routine - * via operator overloading. - */ -#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using warpctcFunc = decltype(&::__name); \ - std::call_once(warpctc_dso_flag, []() { \ - warpctc_dso_handle = phi::dynload::GetWarpCTCDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(warpctc_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ - DYNAMIC_LOAD_WARPCTC_WRAP(__name) - -#define WARPCTC_ROUTINE_EACH(__macro) \ - __macro(get_warpctc_version); \ - __macro(ctcGetStatusString); \ - __macro(compute_ctc_loss); \ - __macro(compute_ctc_loss_double); \ - __macro(get_workspace_size); \ - __macro(get_workspace_size_double) - -WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); - -#undef DYNAMIC_LOAD_WARPCTC_WRAP - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h index 51f4ce86890..dc9bc376e63 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h @@ -16,7 +16,7 @@ #include -#include "kernels/impl/warpctc.h" +#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h index 9794ba1b3c0..e0b15feca03 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h @@ -16,7 +16,7 @@ #include -#include "kernels/impl/warpctc.h" +#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/tensor_utils.h" @@ -58,7 +58,7 @@ class ComputeCtcLossFunctor { float* costs, void* workspace, ctcOptions options) { - return phi::dynload::compute_ctc_loss(activations, + return compute_ctc_loss(activations, gradients, flat_labels, label_lengths, @@ -84,7 +84,7 @@ class ComputeCtcLossFunctor { double* costs, void* workspace, ctcOptions options) { - return phi::dynload::compute_ctc_loss_double( + return compute_ctc_loss_double( activations, gradients, flat_labels, @@ -141,14 +141,14 @@ class WarpCTCFunctor { ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR; if (sizeof(T) == 4) { status = - phi::dynload::get_workspace_size(cpu_label_lengths, + get_workspace_size(cpu_label_lengths, 
cpu_input_lengths, static_cast(sequence_width), static_cast(num_sequences), options_, &workspace_bytes); } else { - status = phi::dynload::get_workspace_size_double( + status = get_workspace_size_double( cpu_label_lengths, cpu_input_lengths, static_cast(sequence_width), @@ -162,7 +162,7 @@ class WarpCTCFunctor { errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, - phi::dynload::ctcGetStatusString(status))); + ctcGetStatusString(status))); PADDLE_ENFORCE_GT( workspace_bytes, 0UL, @@ -197,12 +197,12 @@ class WarpCTCFunctor { errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, - phi::dynload::ctcGetStatusString(status))); + ctcGetStatusString(status))); } protected: void init(const Context& dev_ctx, const size_t blank) { - warpctc_version_ = phi::dynload::get_warpctc_version(); + warpctc_version_ = get_warpctc_version(); if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { diff --git a/backends/metax_gpu/kernels/impl/warprnnt.h b/backends/metax_gpu/kernels/impl/warprnnt.h deleted file mode 100644 index 50b0dfc0efc..00000000000 --- a/backends/metax_gpu/kernels/impl/warprnnt.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/common/port.h" -#include "third_party/warprnnt/include/rnnt.h" - -namespace phi { -namespace dynload { - -extern std::once_flag warprnnt_dso_flag; -extern void* warprnnt_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load warprnnt routine - * via operator overloading. - */ -#define DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using warprnntFunc = decltype(&::__name); \ - std::call_once(warprnnt_dso_flag, []() { \ - warprnnt_dso_handle = phi::dynload::GetWarpRNNTDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(warprnnt_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \ - DYNAMIC_LOAD_WARPRNNT_WRAP(__name) - -#define WARPRNNT_ROUTINE_EACH(__macro) \ - __macro(get_warprnnt_version); \ - __macro(rnntGetStatusString); \ - __macro(compute_rnnt_loss); \ - __macro(compute_rnnt_loss_fp64); \ - __macro(get_rnnt_workspace_size); - -WARPRNNT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP); - -#undef DYNAMIC_LOAD_WARPRNNT_WRAP - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h index bb4311f5912..457fdcb9bff 100644 --- a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h @@ -16,7 +16,7 @@ #include -#include "kernels/impl/warprnnt.h" +#include "third_party/warprnnt/include/rnnt.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -55,7 +55,7 @@ class ComputeRnntLossFunctor { float* costs, void* workspace, rnntOptions options) { - return phi::dynload::compute_rnnt_loss(activations, + return compute_rnnt_loss(activations, gradients, label, label_lengths, @@ -81,7 +81,7 @@ class ComputeRnntLossFunctor { double* costs, void* workspace, rnntOptions options) { - return phi::dynload::compute_rnnt_loss_fp64(activations, + return compute_rnnt_loss_fp64(activations, gradients, label, label_lengths, @@ -149,7 +149,7 @@ class WarpRNNTFunctor { } size_t workspace_bytes = 0; - status = phi::dynload::get_rnnt_workspace_size( + status = get_rnnt_workspace_size( maxT, maxU, B, gpu, &workspace_bytes, sizeof(T)); PADDLE_ENFORCE_EQ( @@ -158,7 +158,7 @@ class WarpRNNTFunctor { errors::PreconditionNotMet( "warp-rnnt [version %d] Error in get_rnnt_workspace_size: %s", warprnnt_version_, - phi::dynload::rnntGetStatusString(status))); + rnntGetStatusString(status))); PADDLE_ENFORCE_GT( workspace_bytes, 0UL, @@ -190,7 +190,7 @@ class WarpRNNTFunctor { errors::PreconditionNotMet( "warp-rnnt [version %d] Error in get_workspace_size: %s", warprnnt_version_, - phi::dynload::rnntGetStatusString(status))); + rnntGetStatusString(status))); } protected: @@ -200,7 +200,7 @@ class WarpRNNTFunctor { const size_t blank, const float fastemit_lambda, const int num_threads) { - warprnnt_version_ = phi::dynload::get_warprnnt_version(); + warprnnt_version_ = get_warprnnt_version(); options_.maxT = maxT; options_.maxU = maxU; diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_context.cc index 4df4d88b0b4..f0c92f00565 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_context.cc @@ -15,7 +15,25 @@ #include "kernels/metax_context.h" namespace phi { -bool AllowTF32Cudnn() { return false; } +const bool allow_tf32_cublas = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return false; +}(); + +const bool allow_tf32_cudnn = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; +}(); + +bool AllowTF32Cublas() { return allow_tf32_cublas; } +bool AllowTF32Cudnn() { return 
allow_tf32_cudnn; } + void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_context.h index 5974aadcc41..683a6df7017 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_context.h @@ -128,6 +128,7 @@ inline void InitCusolverDnHandle(cusolverDnHandle_t* handle, } } +bool AllowTF32Cublas(); bool AllowTF32Cudnn(); inline cusolverDnHandle_t GetCusolverDnHandle(gpuStream_t stream, Place place) { std::call_once(flag_cusolver_dn_, [&]() { From ce54693240221505b150900fb601e640181a5620 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Tue, 16 Sep 2025 18:12:37 +0800 Subject: [PATCH 18/58] [metax]modify library to static library (#24) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library --- backends/metax_gpu/cmake/warpctc.cmake | 19 +++++++++---------- backends/metax_gpu/cmake/warprnnt.cmake | 19 +++++++++---------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index 9edc92f0a94..0733c0f9ce5 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -66,11 +66,11 @@ set(WARPCTC_LIB_DIR if(WIN32) set(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) else() set(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) endif() @@ -93,10 +93,10 @@ if(WIN32) set(WARPCTC_CXX_FLAGS_DEBUG $) else() - set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPCTC_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) - set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPCTC_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) endif() @@ -127,7 +127,7 @@ ExternalProject_Add( -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} -DWITH_TORCH=OFF -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - -DBUILD_SHARED=ON + -DBUILD_SHARED=OFF -DBUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} @@ -145,8 +145,7 @@ get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) include_directories(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its # headers. 
-add_library(warpctc SHARED IMPORTED GLOBAL) -set_target_properties(warpctc PROPERTIES - IMPORTED_LOCATION ${WARPCTC_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${WARPCTC_INCLUDE_DIR} -) \ No newline at end of file +add_library(warpctc STATIC IMPORTED GLOBAL) +set_target_properties( + warpctc PROPERTIES IMPORTED_LOCATION ${WARPCTC_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPCTC_INCLUDE_DIR}) diff --git a/backends/metax_gpu/cmake/warprnnt.cmake b/backends/metax_gpu/cmake/warprnnt.cmake index 527f2e55a1b..a8d6683af2b 100644 --- a/backends/metax_gpu/cmake/warprnnt.cmake +++ b/backends/metax_gpu/cmake/warprnnt.cmake @@ -62,11 +62,11 @@ set(WARPRNNT_LIB_DIR if(WIN32) set(WARPRNNT_LIBRARIES - "${WARPRNNT_INSTALL_DIR}/bin/warprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPRNNT_INSTALL_DIR}/bin/warprnnt${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-rnnt Library" FORCE) else() set(WARPRNNT_LIBRARIES - "${WARPRNNT_INSTALL_DIR}/lib/libwarprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPRNNT_INSTALL_DIR}/lib/libwarprnnt${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-rnnt Library" FORCE) endif() @@ -90,10 +90,10 @@ if(WIN32) set(WARPRNNT_CXX_FLAGS_DEBUG $) else() - set(WARPRNNT_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPRNNT_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") set(WARPRNNT_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) set(WARPRNNT_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) - set(WARPRNNT_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPRNNT_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(WARPRNNT_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) set(WARPRNNT_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) endif() @@ -120,7 +120,7 @@ ExternalProject_Add( -DWITH_ROCM=${WITH_ROCM} -DWITH_OMP=${USE_OMP} -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} - -DBUILD_SHARED=ON + -DBUILD_SHARED=OFF -DBUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} @@ -137,8 +137,7 @@ get_filename_component(WARPRNNT_LIBRARY_PATH ${WARPRNNT_LIBRARIES} DIRECTORY) include_directories(${WARPRNNT_INCLUDE_DIR}) # For warprnnt code to include its # headers. 
-add_library(warprnnt SHARED IMPORTED GLOBAL) -set_target_properties(warprnnt PROPERTIES - IMPORTED_LOCATION ${WARPRNNT_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${WARPRNNT_INCLUDE_DIR} -) \ No newline at end of file +add_library(warprnnt STATIC IMPORTED GLOBAL) +set_target_properties( + warprnnt PROPERTIES IMPORTED_LOCATION ${WARPRNNT_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPRNNT_INCLUDE_DIR}) From 4cda637ff68d88adfd88c322d4d55c9d7dd15397 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 16 Sep 2025 18:14:09 +0800 Subject: [PATCH 19/58] [Metax] organize documents (#25) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents --- .../calc_reduced_attn_kernel_register.cu | 2 +- backends/metax_gpu/kernels/funcs/softmax.cu | 2 +- .../kernels/funcs/values_vectors_functor.h | 2 +- .../metax_gpu/kernels/gpudnn/conv_cudnn_v7.h | 2 +- .../conv_transpose_grad_kernel_register.cu | 2 +- .../kernels/gpudnn/pool_kernel_register.cu | 2 +- .../metax_gpu/kernels/gpudnn/softmax_gpudnn.h | 2 +- .../kernels/impl/dirichlet_kernel_impl.h | 2 +- .../addmm_grad_kernel_register.cu | 0 .../addmm_kernel_register.cu | 0 .../batch_fc_grad_kernel_register.cu | 0 .../batch_norm_grad_kernel_register.cu | 2 +- .../batch_norm_kernel_register.cu | 0 .../bilinear_grad_kernel_register.cu | 0 .../bilinear_kernel_register.cu | 0 .../metax_kernel/blha_get_max_len_register.cu | 2 +- .../bmm_grad_kernel_register.cu | 0 .../bmm_kernel_register.cu | 0 ...abel_cross_entropy_grad_kernel_register.cu | 0 .../cholesky_grad_kernel_register.cu | 0 .../metax_kernel/cholesky_kernel_register.cu | 2 +- .../conv_kernel_register.cu | 0 .../conv_transpose_kernel_register.cu | 0 .../crop_kernel_register.cu | 0 .../cross_entropy_kernel_register.cu | 2 +- .../depthwise_conv_grad_kernel.cu | 0 .../depthwise_conv_kernel.cu | 0 .../kernels/{ => metax_kernel}/elementwise.h | 0 .../{ => metax_kernel}/flags_declare.cu | 0 .../flash_attn_grad_kernel.cu | 0 .../{ => metax_kernel}/flash_attn_kernel.cu | 0 .../{ => metax_kernel}/flash_attn_kernel.h | 0 .../{ => metax_kernel}/flash_attn_utils.h | 0 .../kernels/{ => metax_kernel}/flashattn.cc | 0 .../kernels/{ => metax_kernel}/flashattn.h | 0 .../flatten2_grad_kernel_register.cu | 0 .../flatten2_kernel_register.cu | 0 .../fused_conv2d_add_act_kernel_register.cu | 3 +- .../fused_rope_grad_kernel_register.cu | 0 .../fused_rope_kernel_register.cu | 0 .../instance_norm_grad_kerne_registerl.cu | 2 +- .../instance_norm_kernel_register.cu | 2 +- .../layer_norm_grad_kernel_register.cu | 0 .../layer_norm_kernel_register.cu | 0 .../lstm_kernel_register.cu | 0 .../metax_kernel/lu_kernel_register.cu | 2 +- .../lu_solve_grad_kernel_register.cu | 0 .../metax_kernel/matrix_rank_tol_kernel.cu | 2 +- .../{ => metax_kernel}/metax_context.cc | 24 +-- .../{ => metax_kernel}/metax_context.h | 6 +- .../multi_dot_grad_kernel_register.cu | 0 .../multi_dot_kernel_register.cu | 0 .../mv_grad_kernel_register.cu | 0 .../mv_kernel_register.cu | 0 .../metax_kernel/qr_kernel_register.cu | 2 +- .../rank_attention_grad_kernel_register.cu | 0 .../rank_attention_kernel_register.cu | 0 .../metax_kernel/rnn_grad_kernel.cu.cc | 2 +- .../kernels/metax_kernel/rnn_kernel.cu.cc | 2 +- .../slogdeterminant_kernel_register.cu | 0 .../softmax_kernel_grad_register.cu | 0 .../softmax_kernel_register.cu | 0 .../solve_grad_kernel_register.cu | 0 .../standard_gamma_kernel_register.cu | 0 .../stft_kernel_register.cu | 0 
.../svd_kernel_register.cu | 0 .../top_k_grad_kernel_register.cu | 0 .../triangular_solve_grad_kernel_register.cu | 0 .../triangular_solve_kernel_register.cu | 0 .../warprnnt_kernel_register.cu | 0 .../weight_only_linear_kernel.cu | 0 .../weight_quantize_kernel_register.cu | 0 backends/metax_gpu/patch/paddle.patch | 204 +++++++++--------- backends/metax_gpu/tests/CMakeLists.txt | 54 ++--- 74 files changed, 166 insertions(+), 163 deletions(-) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_fc_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_norm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/cholesky_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_transpose_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/crop_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_grad_kernel.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_kernel.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/elementwise.h (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flags_declare.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_grad_kernel.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.h (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_utils.h (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.cc (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.h (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_conv2d_add_act_kernel_register.cu (99%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lstm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lu_solve_grad_kernel_register.cu (100%) rename 
backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.cc (90%) rename backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.h (96%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/slogdeterminant_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_grad_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/solve_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/standard_gamma_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/stft_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/svd_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/top_k_grad_kernel_register.cu (100%) mode change 100755 => 100644 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/warprnnt_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_only_linear_kernel.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_quantize_kernel_register.cu (100%) diff --git a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu index 11def2c9ee4..2aa8424f0b1 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/flash_attn_utils.h" +#include "kernels/metax_kernel/flash_attn_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/calc_reduced_attn_kernel.h" diff --git a/backends/metax_gpu/kernels/funcs/softmax.cu b/backends/metax_gpu/kernels/funcs/softmax.cu index d738a53f43a..44bfd02a308 100644 --- a/backends/metax_gpu/kernels/funcs/softmax.cu +++ b/backends/metax_gpu/kernels/funcs/softmax.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h index ec429950872..8c5996e680b 100644 --- a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h +++ b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h @@ -24,7 +24,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/common/errors.h" #endif -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h index da61a1e5b41..a0f89047045 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h +++ b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "glog/logging.h" -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h" diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu index 0067818d165..b7eebfcee2e 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "kernels/gpudnn/conv_cudnn_v7.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/ddim.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/dynload/cudnn.h" diff --git a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu index c115f5ad930..1c2bfeedf34 100644 --- a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gpudnn/pool_gpudnn.h" -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h index 168752700e9..5844886ad1b 100644 --- a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h +++ b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h @@ -25,7 +25,7 @@ #include "paddle/phi/kernels/primitive/kernel_primitives.h" // See Note [ Why still include the fluid headers? 
] -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" diff --git a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h index 70af87513e5..c2e2e341bf5 100644 --- a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h @@ -17,7 +17,7 @@ #include #include -#include "kernels/elementwise.h" +#include "kernels/metax_kernel/elementwise.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu index 062646bbf9d..52fe5a1d566 100644 --- a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/flags.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu index bc9eb23c0e8..42810569fde 100644 --- a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/flash_attn_utils.h" #include "kernels/metax_kernel/block_attn.h" +#include "kernels/metax_kernel/flash_attn_utils.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu 
b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu index e8fae2d9da5..8a39ae3f0a8 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu index e94862ec7b0..043a64dc149 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/kernels/cross_entropy_kernel.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu diff --git a/backends/metax_gpu/kernels/elementwise.h b/backends/metax_gpu/kernels/metax_kernel/elementwise.h similarity index 100% rename from backends/metax_gpu/kernels/elementwise.h rename to backends/metax_gpu/kernels/metax_kernel/elementwise.h diff --git a/backends/metax_gpu/kernels/flags_declare.cu b/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu similarity index 100% rename from backends/metax_gpu/kernels/flags_declare.cu rename to backends/metax_gpu/kernels/metax_kernel/flags_declare.cu diff --git a/backends/metax_gpu/kernels/flash_attn_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h diff --git a/backends/metax_gpu/kernels/flash_attn_utils.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_utils.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h diff --git a/backends/metax_gpu/kernels/flashattn.cc b/backends/metax_gpu/kernels/metax_kernel/flashattn.cc similarity index 100% rename from backends/metax_gpu/kernels/flashattn.cc rename to backends/metax_gpu/kernels/metax_kernel/flashattn.cc diff --git a/backends/metax_gpu/kernels/flashattn.h b/backends/metax_gpu/kernels/metax_kernel/flashattn.h similarity index 100% rename from backends/metax_gpu/kernels/flashattn.h rename to backends/metax_gpu/kernels/metax_kernel/flashattn.h diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu similarity index 100% rename from 
backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu similarity index 99% rename from backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu index 48809ceefa4..c0d15b7f1b4 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu @@ -308,7 +308,8 @@ class CudnnConvDescManager { int groups, cudnnDataType_t dtype) { auto* desc = new phi::backends::gpu::ConvolutionDescriptor(); - desc->set(dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); + desc->set( + dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); return desc; } diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu index d7540d949a9..bdf341f5a35 100644 --- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu index db975d74665..e0c0ae9c1d6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/layer_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu index 5a2d85418a1..72e4c5b2b79 100644 --- a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu @@ -18,7 +18,7 @@ #include "paddle/phi/backends/dynload/cusolver.h" #endif -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu index bda5dc62f1a..d8c3355e6e4 100644 --- a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu @@ -18,7 +18,7 @@ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/type_traits.h" diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc similarity index 90% rename from backends/metax_gpu/kernels/metax_context.cc rename to backends/metax_gpu/kernels/metax_kernel/metax_context.cc index f0c92f00565..62aaa5fb2de 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -12,27 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" namespace phi { const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return false; }(); const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; }(); bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } +bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h similarity index 96% rename from backends/metax_gpu/kernels/metax_context.h rename to backends/metax_gpu/kernels/metax_kernel/metax_context.h index 683a6df7017..a6610c1dab2 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ -#define BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ +#define BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ #include #include #include @@ -161,4 +161,4 @@ inline DnnWorkspaceHandle GetDnnWorkspace(Allocator* alloactor, return DnnWorkspaceHandle(alloactor, stream); } } // namespace phi -#endif // BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#endif // BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu index 745069e2eda..c3041254444 100644 --- 
a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu @@ -22,7 +22,7 @@ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc index 499832049e4..101b51aa350 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/rnn_grad_kernel.h" -#include "kernels/metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc index f1cf9e09dc7..2598ce093e6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc @@ -15,7 +15,7 @@ #include "paddle/phi/kernels/rnn_kernel.h" #include "glog/logging.h" -#include "kernels/metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/generator.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu similarity index 
100% rename from backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu old mode 100755 new mode 100644 similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 0283a443adb..e56826c4f3e 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -16,16 +16,16 @@ index 
cfada544d4..a690e97d74 100644 - set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) + # set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) endif() - + set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index bff0f2bf70..9376b5781f 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -16,7 +16,7 @@ - + #include - + -#include "paddle/fluid/platform/enforce.h" +// #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/os_info.h" @@ -76,7 +76,7 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnDestroyActivationDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - + #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 @@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ @@ -91,11 +91,11 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnRNNForwardInferenceEx); CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + @@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + -#if CUDNN_VERSION < 90000 -#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \ - __macro(cudnnGetRNNParamsSize); \ @@ -132,15 +132,15 @@ index c0080f0a5e..458ca3e2e8 100644 -#endif } // namespace dynload } // namespace phi - + diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h index 1547909d92..66b2779392 100644 --- a/paddle/phi/backends/dynload/cufft.h +++ b/paddle/phi/backends/dynload/cufft.h @@ -1,3 +1,4 @@ -+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ @@ -160,23 +160,23 @@ index 59e92955c9..d2f8c2da15 100644 @@ -24,8 +24,8 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/common/port.h" - + -namespace phi { -namespace dynload { +// namespace phi { +// namespace dynload { - + extern std::once_flag cupti_dso_flag; extern void *cupti_dso_handle; @@ -71,7 +71,7 @@ extern void *cupti_dso_handle; CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); - + #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP -} // namespace dynload -} // namespace phi +// } // namespace dynload +// } // namespace phi - + -#endif // PADDLE_WITH_CUPTI +#endif // PADDLE_WITH_CUPTI \ No newline at end of file @@ -230,28 +230,28 @@ index 4ff2e528a9..81421c8ca1 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h @@ -1,3 +1,4 @@ -+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
- + Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,7 +26,7 @@ namespace phi { namespace backends { namespace gpu { - + -#define FULL_WARP_MASK 0xFFFFFFFF +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - + @@ -45,12 +46,12 @@ namespace gpu { - + template __forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { +CudaShuffleDownSync(unsigned long long mask, T val, int delta, int width = warpSize) { return __shfl_down_sync(mask, val, static_cast(delta), width); } - + template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, +__forceinline__ __device__ T CudaShuffleXorSync(unsigned long long mask, @@ -259,7 +259,7 @@ index 4ff2e528a9..81421c8ca1 100644 int width = warpSize) { return __shfl_xor_sync(mask, val, width); @@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { @@ -267,7 +267,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::float16(__shfl_down_sync( mask, val.to_half(), static_cast(delta), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { @@ -276,7 +276,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16(__shfl_down_sync( mask, val.to_nv_bfloat16(), static_cast(delta), width)); @@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -285,7 +285,7 @@ index 4ff2e528a9..81421c8ca1 100644 mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( @@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -294,14 +294,14 @@ index 4ff2e528a9..81421c8ca1 100644 static_cast(__shfl_down_sync(mask, static_cast(val.real), @@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { + unsigned long long mask, phi::dtype::float16 val, int width) { return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { @@ -310,7 +310,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16( __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); @@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -319,7 +319,7 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( @@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, 
phi::dtype::complex val, int width) { @@ -328,14 +328,14 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( @@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template __forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { +CudaShuffleSync(unsigned long long mask, T val, int src_line, int width = 32) { return __shfl_sync(mask, val, src_line, width); } - + @@ -160,7 +161,7 @@ __device__ T reduceSum(T val, int tid, int len) { // but most card's warp size is 32. const int warpSize = 32; @@ -343,7 +343,7 @@ index 4ff2e528a9..81421c8ca1 100644 - unsigned mask = 0u; + unsigned long long mask = 0ull; CREATE_SHFL_MASK(mask, tid < len); - + for (int offset = warpSize / 2; offset > 0; offset /= 2) diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73e..1e4cdf16be 100644 @@ -351,7 +351,7 @@ index 024a7de73e..1e4cdf16be 100644 +++ b/paddle/phi/core/enforce.h @@ -45,7 +45,9 @@ limitations under the License. */ #endif - + #ifdef PADDLE_WITH_CUDA -#include "paddle/phi/backends/dynload/cublas.h" +// #include "paddle/phi/backends/dynload/../../../../../cublas.h" @@ -361,9 +361,9 @@ index 024a7de73e..1e4cdf16be 100644 #include "paddle/phi/backends/dynload/curand.h" #include "paddle/phi/backends/dynload/cusolver.h" @@ -97,7 +99,7 @@ inline bool is_error(bool stat) { return !stat; } - + void ThrowWarnInternal(const std::string& message); - + -#if defined(__CUDA_ARCH__) +#if defined(__CUDACC__) // For cuda, the assertions can affect performance and it is therefore @@ -379,7 +379,7 @@ index 024a7de73e..1e4cdf16be 100644 } while (0) #elif defined(__HIPCC__) @@ -757,4 +759,4 @@ inline void retry_sleep(unsigned millisecond) { - + } // namespace enforce using namespace enforce; // NOLINT -} // namespace phi @@ -392,7 +392,7 @@ index c646e487d0..325122175c 100644 @@ -25,8 +25,9 @@ #else #include - + -#include "paddle/phi/backends/dynload/cublas.h" -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublas.h" @@ -400,16 +400,16 @@ index c646e487d0..325122175c 100644 +// #include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/dynload/cudnn.h" #endif - + @@ -90,7 +91,7 @@ DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, - + // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workaround. 
-DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - + #undef DECLARE_TYPE_FOR_GPU - + diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h index 2d02eb370b..8a7233e34e 100644 --- a/paddle/phi/core/platform/device_context.h @@ -430,58 +430,58 @@ index d69eb67d6f..1d8b6e9375 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -18,7 +18,7 @@ - + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ - + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" - + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu index 88663ec880..98b93072a3 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cu +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -12,7 +12,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/gru_compute.h" - + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -18,7 +18,7 @@ #include - + #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 @@ -489,14 +489,14 @@ index e101224970..a52eb6096f 100644 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" - + namespace phi { namespace funcs { - + + + template @@ -514,19 +514,19 @@ index 558d363b39..05da04b517 100644 +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" - + diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu index 8b0baf5f5f..260482f124 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; - + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..3c74792690 100644 @@ -535,7 +535,7 @@ index e30d440ff3..3c74792690 100644 @@ -30,11 +30,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" - + -#define FINAL_MASK 0xffffffff +#define FINAL_MASK 0xffffffffffffffffull #ifdef PADDLE_WITH_HIP @@ -545,7 +545,7 @@ index e30d440ff3..3c74792690 100644 +#define WARP_SIZE 64 #endif #define MAX_NUM_THREADS 1024 - + @@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { @@ -606,7 +606,7 @@ index e30d440ff3..3c74792690 100644 + topk[0 + offset].v = p.v; + topk[0 + offset].id = p.id; } - + template @@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template @@ -662,7 +662,7 @@ index e30d440ff3..3c74792690 100644 + // topk + MaxLength - *beam, src, tid, dim, *max, length, largest); } } - + @@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } @@ -697,7 +697,7 @@ index e30d440ff3..3c74792690 100644 - if (--(*k) == 0) break; + // if (--(*k) == 0) break; + unsigned long long mask = 0ull; - + - unsigned mask = 0u; + // unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); @@ -721,14 +721,14 @@ index e30d440ff3..3c74792690 100644 + return ret; } - + static __device__ __forceinline__ unsigned int SetBitfield( unsigned int val, unsigned int to_insert, int pos, int len) { unsigned int ret; - asm("bfi.b32 %0, %1, %2, %3, %4;" - : "=r"(ret) - : "r"(to_insert), "r"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (32 - pos - len)) >> (32 - len); return ret; } @@ -738,12 +738,12 @@ index e30d440ff3..3c74792690 100644 int len) { uint64_t ret; - asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); -+ ++ + + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); return ret; } - + @@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { @@ -751,7 +751,7 @@ index e30d440ff3..3c74792690 100644 - asm("bfi.b64 %0, %1, %2, %3, %4;" - : "=l"(ret) - : "l"(to_insert), "l"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); + return ret; @@ -763,7 +763,7 @@ index e30d440ff3..3c74792690 100644 int lane_id; - asm("mov.s32 %0, %%laneid;" : "=r"(lane_id)); - return lane_id; -+ ++ +// // >>>> PTX2CPP Success <<<< +// { +// 
(lane_id)=(threadIdx.x&(warpSize-1)); @@ -771,7 +771,7 @@ index e30d440ff3..3c74792690 100644 + return ::__lane_id(); + // return lane_id; } - + __device__ __forceinline__ unsigned GetLaneMaskLe() { unsigned mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); @@ -780,17 +780,17 @@ index e30d440ff3..3c74792690 100644 + return ((uint64_t(1) << ::__lane_id()) << 1) - 1; + // return mask; } - + template @@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, - + // 1. Find the k-th value T kth_value = static_cast(0); - RadixSearch::RadixType, IndexType, false>( + // RadixSearch::RadixType, IndexType, false>( + RadixSearch::RadixType, IndexType, false>( cur_input, k, num_cols, shared_mem, &kth_value); - + __shared__ int64_t block_min_idx; @@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } @@ -803,12 +803,12 @@ index 32db61532f..0220316bc3 100644 +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ #pragma once - + #if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublasLt.h" #endif - + #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d55..ea42cc10a9 100644 @@ -830,12 +830,12 @@ index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -843,12 +843,12 @@ index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -863,7 +863,7 @@ index f0cca0f701..02ea957240 100644 -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#include "kernels/gpudnn/conv_gpudnn.h" +#include "kernels/impl/conv_cudnn_impl.h" - + namespace phi { // To determine use cudnn or not. 
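// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the patch): the
// top_k_function_cuda.h hunks above replace CUDA-only PTX inline assembly
// (bfe.u32/bfe.u64, bfi.b32/bfi.b64, %%laneid, %%lanemask_le) with portable
// C++ so the radix top-k path can run on MetaX hardware with 64-lane warps.
// Below is a minimal sketch of the shift-based bitfield emulation, assuming
// 0 <= pos, 0 < len, and pos + len <= 32; the helper names here are
// hypothetical, not the patch's own symbols.

#include <cassert>

// Extract `len` bits of `val` starting at bit `pos` (emulates PTX bfe.u32):
// shift the field up to the top of the word, then arithmetic-free shift down.
static inline unsigned GetBitfieldSketch(unsigned val, int pos, int len) {
  return (val << (32 - pos - len)) >> (32 - len);
}

// Insert the low `len` bits of `ins` into `val` at bit `pos` (emulates PTX
// bfi.b32): clear the target field with a mask, then OR the shifted bits in.
static inline unsigned SetBitfieldSketch(unsigned val,
                                         unsigned ins,
                                         int pos,
                                         int len) {
  unsigned field = (len == 32) ? ~0u : ((1u << len) - 1u);
  unsigned mask = field << pos;
  return (val & ~mask) | ((ins << pos) & mask);
}

int main() {
  assert(GetBitfieldSketch(0b101100u, 2, 3) == 0b011u);       // bits [4:2]
  assert(SetBitfieldSketch(0u, 0b101u, 4, 3) == 0b1010000u);  // field at [6:4]
  return 0;
}

// Design note: the patch's own SetBitfield replacements appear to reuse the
// extract expression and ignore `to_insert`; if that is not intentional, a
// mask-and-or insert like SetBitfieldSketch above would preserve bfi
// semantics.
// ---------------------------------------------------------------------------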
diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h @@ -890,7 +890,7 @@ index 29fa252e96..4ae72b0935 100644 +// #endif return tanhf(x); } - + diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -901,9 +901,9 @@ index 11efd87965..679db14c24 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -914,9 +914,9 @@ index 63c35dd4ee..15da9aea45 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 1bdbe1564c..f753b54bc6 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -948,7 +948,7 @@ index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. */ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" @@ -961,14 +961,14 @@ index 2789cb59a2..b91b076f7f 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - + diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 9a21c23666..86413d1577 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -993,7 +993,7 @@ index 4459a931da..837c8682b8 100644 -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index ad9e9197dd..5478d9817d 100644 @@ -1013,27 +1013,27 @@ index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) { - + template HOSTDEVICE T igamc(const T a, const T x) { - static T big = 4.503599627370496e15; - static T biginv = 2.22044604925031308085e-16; + const static T big = 4.503599627370496e15; + const static T biginv = 2.22044604925031308085e-16; - + if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); - + diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h index 410fb3c560..009ce03440 100644 --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - + template HOSTDEVICE T digamma(T x) { - static T pi = T{3.14159265358979323846}; + const static T pi = T{3.14159265358979323846}; - + if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -1048,12 +1048,12 @@ index 5ebbc8d2db..48acf8d0cd 100644 -#include "paddle/phi/kernels/funcs/quant_dequant.h" +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" -+#include "kernels/metax_context.h" - ++#include "kernels/metax_kernel/metax_context.h" + #pragma once - + @@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - + { auto helper = - std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); @@ -1067,12 +1067,12 @@ index 1f319c4ae3..9186eb6906 100644 +++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eeb..5fe2c3e7dc 100644 @@ -1080,13 +1080,13 @@ index 6f03f76eeb..5fe2c3e7dc 100644 +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index 7b85903776..3f4b298807 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -1118,14 +1118,14 @@ index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h @@ -14,7 +14,7 @@ - + #pragma once - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/third_party/flagcx b/third_party/flagcx index 7c469f4af9..7e6c4cc3ca 160000 --- a/third_party/flagcx diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 37475773026..410ef006514 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -87,32 +87,34 @@ list( list( REMOVE_ITEM PYTHON_TEST_SCRIPTS - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py # 受 test_sum_op.py 影响 - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py # core.cudnnversion - # 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py # op_test.py 里 - # self._get_places() - # 接口适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py # needs check_grad with fp64 - # precision - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # op_test.py 里 - # self._get_places() 接口适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py # paddle.device.cuda.get_device_properties -) + # 精度问题 + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + # core.cudnnversion + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + # op_test.py 里 self._get_places()接口适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + # device == "gpu" 适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + # paddle-gpu 报错一致 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py
+  # paddle.device.cuda.get_device_properties
+  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py
+  # needs check_grad with fp64 precision
+  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py
+  # CUDAPinnedPlace issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py)
 list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS)
 foreach(test_script ${PYTHON_TEST_SCRIPTS})

From 23fca59cd47c30680a01e9ec79f5d4d16d156320 Mon Sep 17 00:00:00 2001
From: duqimeng <77875733+duqimeng@users.noreply.github.com>
Date: Wed, 17 Sep 2025 10:44:44 +0800
Subject: [PATCH 20/58] [metax]fix_code style and index_elementwise_put_kernel
 (#27)

* [Metax_change_ut]
* fix sum&collect_fpn_proposals op register
* modify profile
* [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel'
* [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels
* [Metax] con2d_grad use gpudnn
* blas handle support
* [Metax] register some kernels & update CMakeLists
* [Metax] fix metax unittest fail
* [Metax] add group_norm & label_smooth kernel and update matmul kernel
* [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register
* add test
* add test
* [test] chang the logic of workspace_host in cholesky_kernel_register
  alloc(cpuplace,size), test pass
  alloc(cpuplace, size, stream), crash
* [Metax] fix compile fail
* Revert "[Metax] fix compile fail"
  This reverts commit 83bc87f686227962b0262e044225c6ed5507b824.
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h'
* [Metax]fix bug and add qr lstsq logsoftmax
* [Metax] con2d_grad use gpudnn
* [Metax]fix bug and add qr lstsq logsoftmax
* [Metax] change_patch
* [Metax] update unit test CMakeLists.txt
* [Metax] update unit test CMakeLists.txt
* [feature] add unique_consecutive kernel
* [metax] add some kernel
* [metax] add some kernel
* [Metax] register baddbmm kernel & update blas api
* [Metax] register baddbmm kernel & update blas api
* [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined
* [feature] add add unique_consecutive kernel.cu
* [fix] fix some test case due to missing op register
* [fix] fix some fail text
* [metax]fix lu eigvalshsqueeze rnn kernel
* [metax]fix lu eigvalshsqueeze rnn kernel
* add and fix some kernels
* [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined
* [Metax] fix conflict
* [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure
* [Metax] update repeat_interleave kernel & ignore max op test
* [metax]fix lu eigvalshsqueeze rnn kernel
* [metax] chang patch fix copy
* [metax] chang patch fix copy
* [Metax] update metax_gpu unit test
* [Metax] fix test CMakeList.txt
* [metax]change_cupti_and_fix_softmax
* [metax]change_patch
* [metax]change_patch
* [metax] updata_qr_kernel
* [metax] updata_qr_kernel
* [Metax] fix cufft and fix some blas kernel apply
* [metax] fix bug
* [Metax] add github action
* [metax]chaneg build
* [metax]chaneg build
* [metax]chaneg build
* [metax]chaneg build
* [metax]chaneg build
* [metax]chaneg build
* [metax]chaneg build
* [metax]fix_code style and index_elementwise_put_kernel

---------

Co-authored-by: Mingkun.Zhang <2496808993@qq.com>
Co-authored-by: metax666
Co-authored-by: jiaxinWang-metax <189149612@qq.com>
Co-authored-by:
MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/CMakeLists.txt | 15 +++-- ...ex_elementwise_put_grad_kernel_register.cu | 18 ++++- .../index_elementwise_put_kernel_register.cu | 18 ++++- .../kernels/gpudnn/conv_kernel_register.cu | 3 +- .../kernels/gpudnn/conv_transpose_kernel.cu | 7 +- .../kernels/impl/warpctc_grad_kernel_impl.h | 2 +- .../kernels/impl/warpctc_kernel_impl.h | 67 +++++++++---------- .../kernels/impl/warprnnt_kernel_impl.h | 39 +++++------ 8 files changed, 103 insertions(+), 66 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 787aae13e40..f282a9fbf7c 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -666,7 +666,6 @@ file( # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/shape_kernel.cc # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/conv_kernel_igemm.cu # ############################################################################ - # kernels/fusion kernels/selected_rows ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -713,10 +712,7 @@ file( kernels/cuda_kernels/*.cc kernels/cuda_kernels/*.cu kernels/funcs/blas/*.cc - kernels/ernie_core/*.cu - kernels/ernie_core/rms_norm_kernel_register.cu - kernels/ernie_core/top_p_sampling_kernel_register.cu - kernels/ernie_core/fused_bias_act_kernel_register.cu) + kernels/ernie_core/*.cu) set(CUSTOM_DEVICE_SRCS ${CUDA_SRCS} ${CC_SRCS} ${ERNIE_CORE_SRCS}) @@ -735,8 +731,13 @@ add_library( target_include_directories( ${TARGET_NAME} - PRIVATE ${PADDLE_SOURCE_DIR} ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/kernels - ${CUDA_INCLUDE_DIRS} ${WARPCTC_INCLUDE_DIR} ${WARPRNNT_INCLUDE_DIR} ${PADDLE_SOURCE_DIR}/third_party/pybind/include + PRIVATE ${PADDLE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/kernels + ${CUDA_INCLUDE_DIRS} + ${WARPCTC_INCLUDE_DIR} + ${WARPRNNT_INCLUDE_DIR} + ${PADDLE_SOURCE_DIR}/third_party/pybind/include ${PADDLE_SOURCE_DIR}/paddle/phi/api/include/compat) target_link_libraries( diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu index c8d69cecae1..f935014d17b 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu" //NOLINT #include "paddle/phi/kernels/index_elementwise_put_grad_kernel.h" - PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_grad, metax_gpu, ALL_LAYOUT, @@ -31,3 +31,19 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_with_tensor_grad, + metax_gpu, + ALL_LAYOUT, + phi::IndexElementwisePutWithTensorGradKernel, + bool, + float, + double, + int, + int8_t, + int64_t, + int16_t, + uint8_t, + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu index 391dd908a8d..533204b8102 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu" //NOLINT #include "paddle/phi/kernels/index_elementwise_put_kernel.h" - PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put, metax_gpu, ALL_LAYOUT, @@ -31,3 +31,19 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_with_tensor, + metax_gpu, + ALL_LAYOUT, + phi::IndexElementwisePutWithTensorKernel, + bool, + float, + double, + int, + int8_t, + int64_t, + int16_t, + uint8_t, + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index bf129fed05c..0a83b504c76 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -81,7 +81,8 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, args.cdesc.set( dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups); #else - args.cdesc.set(dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); + args.cdesc.set( + dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu index 928201c705f..532b7af0db4 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -93,7 +93,12 @@ void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, args.idesc.set(*transformed_out, iwo_groups); args.wdesc.set(*filter, layout_tensor, iwo_groups); args.odesc.set(*transformed_x, iwo_groups); - args.cdesc.set(dtype, padding_common, strides, dilations_, phi::AllowTF32Cudnn(), c_groups); + args.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP SearchResult bwd_result; diff --git a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h index dc9bc376e63..16b740d5523 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h +++ 
b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h @@ -16,7 +16,6 @@ #include -#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -24,6 +23,7 @@ #include "paddle/phi/kernels/funcs/sequence_padding.h" #include "paddle/phi/kernels/funcs/sequence_scale.h" #include "paddle/utils/optional.h" +#include "third_party/warpctc/include/ctc.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h index e0b15feca03..cb39a0171ba 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h @@ -16,7 +16,6 @@ #include -#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/tensor_utils.h" @@ -25,6 +24,7 @@ #include "paddle/phi/kernels/funcs/sequence_padding.h" #include "paddle/phi/kernels/funcs/sequence_scale.h" #include "paddle/utils/optional.h" +#include "third_party/warpctc/include/ctc.h" namespace phi { @@ -59,15 +59,15 @@ class ComputeCtcLossFunctor { void* workspace, ctcOptions options) { return compute_ctc_loss(activations, - gradients, - flat_labels, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + gradients, + flat_labels, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -84,17 +84,16 @@ class ComputeCtcLossFunctor { double* costs, void* workspace, ctcOptions options) { - return compute_ctc_loss_double( - activations, - gradients, - flat_labels, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + return compute_ctc_loss_double(activations, + gradients, + flat_labels, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -140,21 +139,19 @@ class WarpCTCFunctor { size_t workspace_bytes = 0; ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR; if (sizeof(T) == 4) { - status = - get_workspace_size(cpu_label_lengths, - cpu_input_lengths, - static_cast(sequence_width), - static_cast(num_sequences), - options_, - &workspace_bytes); + status = get_workspace_size(cpu_label_lengths, + cpu_input_lengths, + static_cast(sequence_width), + static_cast(num_sequences), + options_, + &workspace_bytes); } else { - status = get_workspace_size_double( - cpu_label_lengths, - cpu_input_lengths, - static_cast(sequence_width), - static_cast(num_sequences), - options_, - &workspace_bytes); + status = get_workspace_size_double(cpu_label_lengths, + cpu_input_lengths, + static_cast(sequence_width), + static_cast(num_sequences), + options_, + &workspace_bytes); } PADDLE_ENFORCE_EQ( CTC_STATUS_SUCCESS, diff --git a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h index 457fdcb9bff..8e3ab6fcdac 100644 --- a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h @@ -16,12 +16,12 @@ #include -#include "third_party/warprnnt/include/rnnt.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include 
"paddle/phi/kernels/funcs/math_function.h" +#include "third_party/warprnnt/include/rnnt.h" namespace phi { @@ -56,15 +56,15 @@ class ComputeRnntLossFunctor { void* workspace, rnntOptions options) { return compute_rnnt_loss(activations, - gradients, - label, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + gradients, + label, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -82,15 +82,15 @@ class ComputeRnntLossFunctor { void* workspace, rnntOptions options) { return compute_rnnt_loss_fp64(activations, - gradients, - label, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + gradients, + label, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -117,6 +117,7 @@ class WarpRNNTFunctor { * \param blank blank label used in rnnt loss function. * \param cpu_loss loss of each example in CPU memory. */ + void operator()(const Context& dev_ctx, const T* input, T* gradient, From a513aaeb4c895177cd1c6b91d8d3b3c6b8ffe5a6 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:07:44 +0800 Subject: [PATCH 21/58] change_build_917 (#29) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index e3c4304e5f8..2bee14930a3 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -24,14 +24,14 @@ pip uninstall paddlepaddle -y git submodule sync --recursive && git submodule update --init --recursive -export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 -export +# export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 +# export pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ -unset http_proxy https_proxy +# unset http_proxy https_proxy # apply patch bash change_patch.sh From 4eb455e0f14f4a74bfd91e3fd44d67500af2a2c0 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:19:49 +0800 Subject: [PATCH 22/58] chang_build (#30) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels 
& update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 2bee14930a3..de409153472 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -22,12 +22,16 @@ pip uninstall paddlepaddle -y # init paddle git submodule sync --recursive && git submodule update --init --recursive +sleep 1000000 +unset http_proxy https_proxy # export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 # export pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle + + python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ From 1773978409b36845416e6491a6b5a2e06ff49992 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Wed, 17 Sep 2025 13:59:58 +0800 
Subject: [PATCH 23/58] [metax]modify kernel (#31) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel --- backends/metax_gpu/patch/paddle.patch | 257 ++++++++++++++------------ 1 file changed, 138 insertions(+), 119 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index e56826c4f3e..667d9f75d1c 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -16,16 +16,16 @@ index cfada544d4..a690e97d74 100644 - set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) + # set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) endif() - + set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index bff0f2bf70..9376b5781f 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -16,7 +16,7 @@ - + #include - + -#include "paddle/fluid/platform/enforce.h" +// #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/os_info.h" @@ -76,7 +76,7 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnDestroyActivationDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - + #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 @@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ @@ -91,11 +91,11 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnRNNForwardInferenceEx); CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + @@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + -#if CUDNN_VERSION < 90000 -#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \ - __macro(cudnnGetRNNParamsSize); \ @@ -132,15 +132,15 @@ index c0080f0a5e..458ca3e2e8 100644 -#endif } // namespace dynload } // namespace phi - + diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h -index 1547909d92..66b2779392 100644 +index 1547909d92..ef20838434 100644 --- a/paddle/phi/backends/dynload/cufft.h +++ b/paddle/phi/backends/dynload/cufft.h @@ -1,3 +1,4 @@ +// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ @@ -160,23 +160,23 @@ index 59e92955c9..d2f8c2da15 100644 @@ -24,8 +24,8 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/common/port.h" - + -namespace phi { -namespace dynload { +// namespace phi { +// namespace dynload { - + extern std::once_flag cupti_dso_flag; extern void *cupti_dso_handle; @@ -71,7 +71,7 @@ extern void *cupti_dso_handle; CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); - + #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP -} // namespace dynload -} // namespace phi +// } // namespace dynload +// } // namespace phi - + -#endif // PADDLE_WITH_CUPTI +#endif // PADDLE_WITH_CUPTI \ No newline at end of file @@ -226,32 +226,32 @@ index c5309e7e11..3328571380 100644 } \ }; \ diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h -index 4ff2e528a9..81421c8ca1 100644 +index 4ff2e528a9..23f7f4b583 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h @@ -1,3 +1,4 @@ +// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,7 +26,7 @@ namespace phi { namespace backends { namespace gpu { - + -#define FULL_WARP_MASK 0xFFFFFFFF +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - + @@ -45,12 +46,12 @@ namespace gpu { - + template __forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { +CudaShuffleDownSync(unsigned long long mask, T val, int delta, int width = warpSize) { return __shfl_down_sync(mask, val, static_cast(delta), width); } - + template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, +__forceinline__ __device__ T CudaShuffleXorSync(unsigned long long mask, @@ -259,7 +259,7 @@ index 4ff2e528a9..81421c8ca1 100644 int width = warpSize) { return __shfl_xor_sync(mask, val, width); @@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { @@ -267,7 +267,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::float16(__shfl_down_sync( mask, val.to_half(), static_cast(delta), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { @@ -276,7 +276,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16(__shfl_down_sync( mask, val.to_nv_bfloat16(), static_cast(delta), width)); @@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -285,7 +285,7 @@ index 4ff2e528a9..81421c8ca1 100644 mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( @@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -294,14 +294,14 @@ index 4ff2e528a9..81421c8ca1 100644 static_cast(__shfl_down_sync(mask, static_cast(val.real), @@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex 
CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { + unsigned long long mask, phi::dtype::float16 val, int width) { return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { @@ -310,7 +310,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16( __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); @@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -319,7 +319,7 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( @@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -328,14 +328,14 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( @@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template __forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { +CudaShuffleSync(unsigned long long mask, T val, int src_line, int width = 32) { return __shfl_sync(mask, val, src_line, width); } - + @@ -160,7 +161,7 @@ __device__ T reduceSum(T val, int tid, int len) { // but most card's warp size is 32. const int warpSize = 32; @@ -343,7 +343,7 @@ index 4ff2e528a9..81421c8ca1 100644 - unsigned mask = 0u; + unsigned long long mask = 0ull; CREATE_SHFL_MASK(mask, tid < len); - + for (int offset = warpSize / 2; offset > 0; offset /= 2) diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73e..1e4cdf16be 100644 @@ -351,7 +351,7 @@ index 024a7de73e..1e4cdf16be 100644 +++ b/paddle/phi/core/enforce.h @@ -45,7 +45,9 @@ limitations under the License. 
*/ #endif - + #ifdef PADDLE_WITH_CUDA -#include "paddle/phi/backends/dynload/cublas.h" +// #include "paddle/phi/backends/dynload/../../../../../cublas.h" @@ -361,9 +361,9 @@ index 024a7de73e..1e4cdf16be 100644 #include "paddle/phi/backends/dynload/curand.h" #include "paddle/phi/backends/dynload/cusolver.h" @@ -97,7 +99,7 @@ inline bool is_error(bool stat) { return !stat; } - + void ThrowWarnInternal(const std::string& message); - + -#if defined(__CUDA_ARCH__) +#if defined(__CUDACC__) // For cuda, the assertions can affect performance and it is therefore @@ -379,7 +379,7 @@ index 024a7de73e..1e4cdf16be 100644 } while (0) #elif defined(__HIPCC__) @@ -757,4 +759,4 @@ inline void retry_sleep(unsigned millisecond) { - + } // namespace enforce using namespace enforce; // NOLINT -} // namespace phi @@ -392,7 +392,7 @@ index c646e487d0..325122175c 100644 @@ -25,8 +25,9 @@ #else #include - + -#include "paddle/phi/backends/dynload/cublas.h" -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublas.h" @@ -400,16 +400,16 @@ index c646e487d0..325122175c 100644 +// #include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/dynload/cudnn.h" #endif - + @@ -90,7 +91,7 @@ DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, - + // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workaround. -DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - + #undef DECLARE_TYPE_FOR_GPU - + diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h index 2d02eb370b..8a7233e34e 100644 --- a/paddle/phi/core/platform/device_context.h @@ -430,58 +430,58 @@ index d69eb67d6f..1d8b6e9375 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -18,7 +18,7 @@ - + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ - + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" - + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu index 88663ec880..98b93072a3 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cu +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -12,7 +12,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/gru_compute.h" - + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -18,7 +18,7 @@ #include - + #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 @@ -489,14 +489,14 @@ index e101224970..a52eb6096f 100644 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" - + namespace phi { namespace funcs { - + + + template @@ -514,28 +514,28 @@ index 558d363b39..05da04b517 100644 +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" - + diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu index 8b0baf5f5f..260482f124 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; - + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h -index e30d440ff3..3c74792690 100644 +index e30d440ff3..108edda7ca 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -30,11 +30,11 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" - + -#define FINAL_MASK 0xffffffff +#define FINAL_MASK 0xffffffffffffffffull #ifdef PADDLE_WITH_HIP @@ -545,7 +545,7 @@ index e30d440ff3..3c74792690 100644 +#define WARP_SIZE 64 #endif #define MAX_NUM_THREADS 1024 - + @@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { @@ -606,7 +606,7 @@ index e30d440ff3..3c74792690 100644 + topk[0 + offset].v = p.v; + topk[0 + offset].id = p.id; } - + template @@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template @@ -662,7 +662,7 @@ index e30d440ff3..3c74792690 100644 + // topk + MaxLength - *beam, src, tid, dim, *max, length, largest); } } - + @@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } @@ -697,7 +697,7 @@ index e30d440ff3..3c74792690 100644 - if (--(*k) == 0) break; + // if (--(*k) == 0) break; + unsigned long long mask = 0ull; - + - unsigned mask = 0u; + // unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); @@ -721,7 +721,7 @@ index e30d440ff3..3c74792690 100644 + return ret; } - + static __device__ __forceinline__ unsigned int SetBitfield( unsigned int val, unsigned int to_insert, int pos, int len) { unsigned int ret; @@ -743,7 +743,7 @@ index e30d440ff3..3c74792690 100644 + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); return ret; } - + @@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { @@ -771,7 +771,7 @@ index e30d440ff3..3c74792690 100644 + return ::__lane_id(); + // return lane_id; } - + __device__ __forceinline__ unsigned GetLaneMaskLe() { unsigned mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); @@ -780,17 +780,17 @@ index e30d440ff3..3c74792690 100644 + return ((uint64_t(1) << ::__lane_id()) << 1) - 1; + // return mask; } - + template @@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, - + // 1. 
Find the k-th value T kth_value = static_cast(0); - RadixSearch::RadixType, IndexType, false>( + // RadixSearch::RadixType, IndexType, false>( + RadixSearch::RadixType, IndexType, false>( cur_input, k, num_cols, shared_mem, &kth_value); - + __shared__ int64_t block_min_idx; @@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } @@ -803,12 +803,12 @@ index 32db61532f..0220316bc3 100644 +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ #pragma once - + #if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublasLt.h" #endif - + #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d55..ea42cc10a9 100644 @@ -830,12 +830,12 @@ index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -843,14 +843,27 @@ index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { +diff --git a/paddle/phi/kernels/gpu/correlation_kernel.cu b/paddle/phi/kernels/gpu/correlation_kernel.cu +index 4c93778bde..c7bdf8a2cc 100644 +--- a/paddle/phi/kernels/gpu/correlation_kernel.cu ++++ b/paddle/phi/kernels/gpu/correlation_kernel.cu +@@ -103,7 +103,7 @@ void CorrelationCUDAKernel(const Context &dev_ctx, + int stride2, + int corr_type_multiply, + DenseTensor *out) { +- bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU; ++ bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM; + PADDLE_ENFORCE_EQ( + is_gpu_place, + true, diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index f0cca0f701..02ea957240 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h @@ -863,9 +876,22 @@ index f0cca0f701..02ea957240 100644 -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#include "kernels/gpudnn/conv_gpudnn.h" +#include "kernels/impl/conv_cudnn_impl.h" - + namespace phi { // To determine use cudnn or not. 
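For context on the correlation hunk above (and the dgc hunk that follows): kernels guarding themselves with place checks written as place.GetType() == phi::AllocationType::GPU reject a plugin device, because a custom backend such as metax_gpu registers its place as AllocationType::CUSTOM, so each check is widened to accept both. A small C++ helper in that spirit is sketched below; the function name is ours, for illustration, while phi::Place::GetType() is the API the hunks themselves call.

#include "paddle/phi/common/place.h"

// True for the native GPU place and for plugin devices registered as
// custom places (e.g. the metax_gpu backend). Illustrative only, not
// part of the patch.
inline bool IsGpuLikePlace(const phi::Place &place) {
  const auto type = place.GetType();
  return type == phi::AllocationType::GPU ||
         type == phi::AllocationType::CUSTOM;
}

Funneling the repeated GPU-or-CUSTOM comparisons through one helper like this keeps the PADDLE_ENFORCE_EQ(is_gpu_place, true, ...) call sites consistent if more place kinds ever need to pass the check.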
+diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu +index c2ddfa1347..c6adf5a6de 100644 +--- a/paddle/phi/kernels/gpu/dgc_kernel.cu ++++ b/paddle/phi/kernels/gpu/dgc_kernel.cu +@@ -188,7 +188,7 @@ void DGCKernel(const Context& dev_ctx, + int buf_size = paddle::communication::dgc::get_buffer_size(k); + phi::Allocator::AllocationPtr tmp_ious_data; + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +- if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + tmp_ious_data = phi::memory_utils::Alloc( + dev_ctx.GetPlace(), + buf_size, diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h @@ -890,7 +916,7 @@ index 29fa252e96..4ae72b0935 100644 +// #endif return tanhf(x); } - + diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -901,9 +927,9 @@ index 11efd87965..679db14c24 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -914,9 +940,9 @@ index 63c35dd4ee..15da9aea45 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 1bdbe1564c..f753b54bc6 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -930,6 +956,19 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/qr_kernel_impl.h" #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" +diff --git a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu +index 05a977828f..5136608c41 100644 +--- a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu ++++ b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu +@@ -58,7 +58,7 @@ void ShuffleBatchKernel(const Context& dev_ctx, + int64_t seed_int = 0; + if (seed.initialized()) { + const auto& seed_place = seed.place().GetType(); +- bool is_gpu_place = seed_place == phi::AllocationType::GPU; ++ bool is_gpu_place = seed_place == phi::AllocationType::GPU || seed_place == phi::AllocationType::CUSTOM; + if (is_gpu_place) { + // NOTE: We have overwritten GetKernelTypeForVar, so seed_place would + // not be CUDAPlace in practice. This case would only happen in Python diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index 9bc5326c90..79b57a8203 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -948,7 +987,7 @@ index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" @@ -961,14 +1000,14 @@ index 2789cb59a2..b91b076f7f 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. */ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - + diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 9a21c23666..86413d1577 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -993,7 +1032,7 @@ index 4459a931da..837c8682b8 100644 -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index ad9e9197dd..5478d9817d 100644 @@ -1013,31 +1052,31 @@ index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) { - + template HOSTDEVICE T igamc(const T a, const T x) { - static T big = 4.503599627370496e15; - static T biginv = 2.22044604925031308085e-16; + const static T big = 4.503599627370496e15; + const static T biginv = 2.22044604925031308085e-16; - + if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); - + diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h index 410fb3c560..009ce03440 100644 --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - + template HOSTDEVICE T digamma(T x) { - static T pi = T{3.14159265358979323846}; + const static T pi = T{3.14159265358979323846}; - + if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h -index 5ebbc8d2db..48acf8d0cd 100644 +index 5ebbc8d2db..c7b6c338e2 100644 --- a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -15,8 +15,9 @@ limitations under the License. */ @@ -1049,11 +1088,11 @@ index 5ebbc8d2db..48acf8d0cd 100644 +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" +#include "kernels/metax_kernel/metax_context.h" - + #pragma once - + @@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - + { auto helper = - std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); @@ -1067,12 +1106,12 @@ index 1f319c4ae3..9186eb6906 100644 +++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eeb..5fe2c3e7dc 100644 @@ -1080,13 +1119,13 @@ index 6f03f76eeb..5fe2c3e7dc 100644 +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index 7b85903776..3f4b298807 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -1118,31 +1157,11 @@ index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h @@ -14,7 +14,7 @@ - + #pragma once - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - -diff --git a/third_party/flagcx b/third_party/flagcx -index 7c469f4af9..7e6c4cc3ca 160000 ---- a/third_party/flagcx -+++ b/third_party/flagcx -@@ -1 +1 @@ --Subproject commit 7c469f4af991bf0f64b8f76d66f8e307a5eaea3f -+Subproject commit 7e6c4cc3cad3fce9b3dedfe46a9d195d616e8ffa -diff --git a/third_party/flashattn b/third_party/flashattn -index 581e48aa69..749aca3807 160000 ---- a/third_party/flashattn -+++ b/third_party/flashattn -@@ -1 +1 @@ --Subproject commit 581e48aa693a17ec3676ec2715d46130310d318d -+Subproject commit 749aca380794b472096d4e7ea01dd252ab0887c9 -diff --git a/third_party/yaml-cpp b/third_party/yaml-cpp ---- a/third_party/yaml-cpp -+++ b/third_party/yaml-cpp -@@ -1 +1 @@ --Subproject commit 1d8ca1f35eb3a9c9142462b28282a848e5d29a91 -+Subproject commit 1d8ca1f35eb3a9c9142462b28282a848e5d29a91-dirty + From 69af38186ebfd6029d6e5b1a057d6e8fa389ee08 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 17:07:26 +0800 Subject: [PATCH 24/58] change_metax_work (#32) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 0d3d2637cdd..c23112f0545 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -19,27 +19,28 @@ defaults: jobs: metax-gpu-test: runs-on: paddle-metax-runner-set + # runs-on: debug-paddle-runner-set steps: - name: Checkout repository run: | git config --global user.name "GitHub Actions" git config --global user.email "actions@github.com" - if [ "${{ github.event_name }}" == "pull_request" ]; then - BRANCH_NAME=${{ github.head_ref }} - else - BRANCH_NAME=${{ github.ref_name }} - fi - git clone \ --reference-if-able /home/runner/PaddleCustomDevice \ --depth=1 \ --shallow-submodules \ --jobs=8 \ - --branch $BRANCH_NAME \ + --branch ${{ github.base_ref }} \ --recurse-submodules \ https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . 
+ if [ "${{ github.event_name }}" == "pull_request" ]; then + git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head + git checkout pull/${{ github.event.pull_request.number }}/head + git submodule update --init --recursive + fi + - name: compile run: | From 7fe6f2dca92c3c0e3fb4c4ceb7f18a26560422e9 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 17:18:26 +0800 Subject: [PATCH 25/58] change_build (#33) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan 
<1184319564@qq.com> --- backends/metax_gpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index de409153472..dbd583c52ea 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -22,8 +22,8 @@ pip uninstall paddlepaddle -y # init paddle git submodule sync --recursive && git submodule update --init --recursive -sleep 1000000 -unset http_proxy https_proxy +# sleep 1000000 +# unset http_proxy https_proxy # export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 From b22fc1317d786931c1aa8784ad30dd72b6dfc2fd Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Wed, 17 Sep 2025 17:58:21 +0800 Subject: [PATCH 26/58] [metax] modify fused_bias_dropout_residual_layer_norm (#34) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm --- backends/metax_gpu/patch/paddle.patch | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 667d9f75d1c..b7bdb953077 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -470,6 +470,25 @@ index 88663ec880..98b93072a3 100644 #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" +diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +index 4eae698648..5c047723ea 100644 +--- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h ++++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +@@ -43,11 +43,11 @@ template + using LayerNormParamType = typename CudnnDataType::BatchNormParamType; + + inline static int GetDesiredBlockDim(int64_t block_dim) { +- const int kMaxBlockDim = 512; ++ const int kMaxBlockDim = 256; + #ifdef __HIPCC__ + const int lwarpSize = 64; + #else +- const int lwarpSize = 32; ++ const int lwarpSize = 64; + #endif + return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; + } + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h From c3d1444ef67441b9bb43f9fa5ee7c5a906a7f9df Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 18:18:30 +0800 Subject: [PATCH 27/58] change_build (#35) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
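For reference, the layer_norm_impl.cu.h hunk in PATCH 26 above reduces to the helper below: the chosen block dimension is either the cap or a single hardware warp, and this backend's warps are 64 lanes wide, so the CUDA fallback of 32 would fill only half a warp. A standalone restatement with the __HIPCC__ branch collapsed (both paths now use 64), not the full header:

#include <cstdint>

// Restatement of the patched GetDesiredBlockDim(): the cap is lowered
// from 512 to 256, and the fallback is one full 64-lane warp rather
// than CUDA's 32, so small rows still occupy a whole hardware warp.
inline int GetDesiredBlockDim(int64_t block_dim) {
  const int kMaxBlockDim = 256;
  const int kWarpSize = 64;
  return block_dim >= kMaxBlockDim ? kMaxBlockDim : kWarpSize;
}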
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 6 ++++-- backends/metax_gpu/build.sh | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index c23112f0545..74de39c2e13 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -31,14 +31,16 @@ jobs: --depth=1 \ --shallow-submodules \ --jobs=8 \ - --branch ${{ github.base_ref }} \ + --branch ${{ github.base_ref || github.ref_name}} \ + --recurse-submodules \ https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . 
if [ "${{ github.event_name }}" == "pull_request" ]; then git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head git checkout pull/${{ github.event.pull_request.number }}/head - git submodule update --init --recursive + + # git submodule update --init --recursive fi diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index dbd583c52ea..042b779a05c 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -21,7 +21,8 @@ pip uninstall paddlepaddle -y # init paddle -git submodule sync --recursive && git submodule update --init --recursive +# git submodule sync --recursive && git submodule update --init --recursive + # sleep 1000000 # unset http_proxy https_proxy From 569a867b358d9d3707c8d41dbbb0641d03e75de8 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 18:21:54 +0800 Subject: [PATCH 28/58] change_build (#36) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 74de39c2e13..51c0c62cef6 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -32,7 +32,6 @@ jobs: --shallow-submodules \ --jobs=8 \ --branch ${{ github.base_ref || github.ref_name}} \ - --recurse-submodules \ https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . From 0edc6f6549fff51d459bf9a77bfbedf4e6a33beb Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:46:15 +0800 Subject: [PATCH 29/58] change_warpctc.cmake (#38) * change_warpctc.cmake --- backends/metax_gpu/cmake/warpctc.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index 0733c0f9ce5..ea8e2ade754 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -35,6 +35,13 @@ else() git checkout -- . 
    && git checkout ${WARPCTC_TAG} && patch -Nd ${SOURCE_DIR} <
    ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch)
+  file(COPY ${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh
+       DESTINATION ${SOURCE_DIR}/include/contrib/moderngpu/include/device/)
+  message(STATUS "Patch file path: ${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh")
+  message(
+    STATUS
+      "ModernGPU device path: ${SOURCE_DIR}/include/contrib/moderngpu/include/device/"
+  )
 endif()

 if(NOT WIN32 AND WITH_GPU)

From 2688c8664cc50961267be572ed467ce4b89bc351 Mon Sep 17 00:00:00 2001
From: duqimeng <77875733+duqimeng@users.noreply.github.com>
Date: Thu, 18 Sep 2025 11:44:44 +0800
Subject: [PATCH 30/58] change_warpctc.cmake (#39)

* change warpctc.cmake

---
 backends/metax_gpu/change_patch.sh     | 3 ++-
 backends/metax_gpu/cmake/warpctc.cmake | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh
index 60d74ec0f3d..f29986a3780 100644
--- a/backends/metax_gpu/change_patch.sh
+++ b/backends/metax_gpu/change_patch.sh
@@ -21,8 +21,9 @@ unzip mcEigen_3.4.0_paddle_final.zip
 mv mcEigen_3.4.0_paddle_final eigen3
 cd ..
 cp -r patch/eigen3/ ../../Paddle/third_party/eigen3
+rm -r patch/eigen3
 cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core
 cd ../../Paddle/
 git apply --verbose ../backends/metax_gpu/patch/paddle.patch
 cd -
-cp -r patch/intrinsics.cuh ../../Paddle/third_party/warpctc/include/contrib/moderngpu/include/device/
+# cp -r patch/intrinsics.cuh ../../Paddle/third_party/warpctc/include/contrib/moderngpu/include/device/
diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake
index ea8e2ade754..5d668032fb1 100644
--- a/backends/metax_gpu/cmake/warpctc.cmake
+++ b/backends/metax_gpu/cmake/warpctc.cmake
@@ -108,6 +108,10 @@ else()
   set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
 endif()

+set(COPY_COMMAND
+    ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh"
+    "${SOURCE_DIR}/include/contrib/moderngpu/include/device/")
+
 ExternalProject_Add(
   extern_warpctc
   ${EXTERNAL_PROJECT_LOG_ARGS}
@@ -117,6 +121,7 @@ ExternalProject_Add(
   PATCH_COMMAND
   COMMAND ${WARPCTC_PATCH_COMMAND}
   COMMAND ${WARPCTC_PATCH_CUDA_COMMAND}
+  COMMAND ${COPY_COMMAND}
   COMMAND ${WARPCTC_PATHCH_ROCM_COMMAND}
   # BUILD_ALWAYS 1
   CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

From 6f031fe12a2020044b898b2b2921c899df3d4e3a Mon Sep 17 00:00:00 2001
From: duqimeng <77875733+duqimeng@users.noreply.github.com>
Date: Thu, 18 Sep 2025 12:10:23 +0800
Subject: [PATCH 31/58] test (#40)

* test

---------

---
 backends/metax_gpu/tests/run_test.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh
index 95cce650e6b..92dea2b492b 100755
--- a/backends/metax_gpu/tests/run_test.sh
+++ b/backends/metax_gpu/tests/run_test.sh
@@ -22,6 +22,8 @@ TEST_PATH1="${SCRIPT_DIR}/../../../python"
 TEST_PATH2="${SCRIPT_DIR}/../../../python/tests"
 export PYTHONPATH="${LEGACY_TEST_PATH}:${PYTHONPATH}:${TEST_PATH1}:${TEST_PATH2}"

+export
+sleep 1000000
 rm -r build
 mkdir -p build && cd build

From e84d399d6056f6dd017031514045a608e717b223 Mon Sep 17 00:00:00 2001
From: duqimeng <77875733+duqimeng@users.noreply.github.com>
Date: Thu, 18 Sep 2025 14:42:12 +0800
Subject: [PATCH 32/58] test_ut (#41)

* change_run_ut

---------

---
 backends/metax_gpu/tests/run_test.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh
index 92dea2b492b..7d1e8e072a9 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -23,7 +23,8 @@ TEST_PATH2="${SCRIPT_DIR}/../../../python/tests" export PYTHONPATH="${LEGACY_TEST_PATH}:${PYTHONPATH}:${TEST_PATH1}:${TEST_PATH2}" export -sleep 1000000 +# sleep 1000000 + rm -r build mkdir -p build && cd build @@ -34,4 +35,4 @@ cmake .. cmake --build . -ctest -j1 --output-on-failure +ctest -j10 --output-on-failure From b5f2feb398cae8217d1dff39a5e7ef31afa0e02d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 16:59:28 +0800 Subject: [PATCH 33/58] tets (#43) * remove_tets --------- --- backends/metax_gpu/build.sh | 2 +- backends/metax_gpu/tests/CMakeLists.txt | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 042b779a05c..9ca589a7807 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -57,7 +57,7 @@ fi echo "make_maca" cd build -cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON +cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON make_maca -j60 echo "install whl" diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 410ef006514..08273782be6 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -81,8 +81,7 @@ list( ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_reduce_op.py) + ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py) list( REMOVE_ITEM From e20eca7e6f9846583293e988b7484380a25f314f Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 18:53:51 +0800 Subject: [PATCH 34/58] test (#44) * test --------- --- backends/metax_gpu/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 08273782be6..795a3c5b8ac 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -95,7 +95,7 @@ list( ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # op_test.py 里 self._get_places()接口适配问题 + # op_test.py 里 self._get_places()接口的适配问题 ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # device == "gpu" 适配问题 From e37f633a4d440a25126273ccddd7c3ff23288a02 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Fri, 19 Sep 2025 18:30:47 +0800 Subject: [PATCH 35/58] [metax] modify compile (#42) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas --- backends/metax_gpu/CMakeLists.txt | 40 +- backends/metax_gpu/compile.sh | 2 +- .../kernels/funcs/blas/blas_impl.cu.h | 1270 ++++++++--------- .../fused_adam_kernel_register.cu | 0 ...esidual_layer_norm_grad_kernel_register.cu | 0 ...out_residual_layer_norm_kernel_register.cu | 0 ...dding_eltwise_layernorm_kernel_register.cu | 0 .../fused_layernorm_kernel_register.cu | 0 
.../fused_seqpool_cvm_grad_kernel_register.cu | 0 .../fused_seqpool_cvm_kernel_register.cu | 0 ...fused_softmax_mask_grad_kernel_register.cu | 0 .../fused_softmax_mask_kernel_register.cu | 0 ...max_mask_upper_triangle_kernel_register.cu | 0 ...d_stack_transpose_quant_kernel_register.cu | 0 ...sed_swiglu_weighted_bwd_kernel_register.cu | 30 + .../fused_token_prune_kernel_register.cu | 0 ...d_transpose_split_quant_kernel_register.cu | 0 ...nspose_wlch_split_quant_kernel_register.cu | 0 .../kernels/metax_kernel/metax_context.cc | 35 - .../kernels/metax_kernel/metax_context.h | 2 - 20 files changed, 597 insertions(+), 782 deletions(-) mode change 100755 => 100644 backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_adam_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_bias_dropout_residual_layer_norm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_embedding_eltwise_layernorm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_layernorm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_seqpool_cvm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_seqpool_cvm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_softmax_mask_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_softmax_mask_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_softmax_mask_upper_triangle_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_stack_transpose_quant_kernel_register.cu (100%) create mode 100644 backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_token_prune_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_transpose_split_quant_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_transpose_wlch_split_quant_kernel_register.cu (100%) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index f282a9fbf7c..7b8c52f1f31 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -70,7 +70,6 @@ include(eigen) include(xxhash) include(zlib) include(protobuf) -include(generate_pb) set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto") get_filename_component(PROTO_WE "${PROTO_FILE}" NAME_WE) @@ -614,12 +613,9 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math_function.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/backends/context_pool.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/repeat_tensor2index_tensor.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/binomial_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bernoulli_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_grad_kernel_impl.h - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/box_coder_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu @@ -642,29 +638,11 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gather_tree_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/graph_reindex_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/group_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_act_dequant_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/fp8_gemm/fp8_gemm_with_cublasLt/fp8_fp8_half_gemm.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/variable_length_memory_efficient_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/gemm_epilogue_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/blha_get_max_len.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/as_real_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/as_complex_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/complex_grad_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/complex_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/shape_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/conv_kernel_igemm.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu # ############################################################################ ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps @@ -697,7 +675,6 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/int_bincount_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu) file( @@ -707,6 +684,8 @@ file( passes/*.cc kernels/*.cc kernels/*.cu + kernels/fusion/*.cc + kernels/fusion/*.cu kernels/gpudnn/*.cc kernels/gpudnn/*.cu kernels/cuda_kernels/*.cc @@ -721,13 +700,7 @@ set_source_files_properties(${CUSTOM_DEVICE_SRCS} PROPERTIES LANGUAGE CUDA) set(CMAKE_CUCC_COMPILER "cucc") set(CMAKE_CUCC_FLAGS "-I /opt/maca/tools/cu-bridge/include/") -set_source_files_properties( - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu - PROPERTIES LANGUAGE CUDA) -add_library( - 
${TARGET_NAME} SHARED - ${CUSTOM_DEVICE_SRCS} - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu) +add_library(${TARGET_NAME} SHARED ${CUSTOM_DEVICE_SRCS}) target_include_directories( ${TARGET_NAME} @@ -753,9 +726,6 @@ target_link_libraries( ${WARPCTC_LIBRARIES} ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcpti.so) include_directories(BEFORE ${PADDLE_SOURCE_DIR}) target_compile_definitions( diff --git a/backends/metax_gpu/compile.sh b/backends/metax_gpu/compile.sh index e9860ccb7d0..eba45a9ced2 100644 --- a/backends/metax_gpu/compile.sh +++ b/backends/metax_gpu/compile.sh @@ -30,7 +30,7 @@ fi echo "make_maca" cd build -cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON +cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON make_maca -j10 diff --git a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h b/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h old mode 100755 new mode 100644 index 419387cc9c4..ae4baa52613 --- a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h +++ b/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h @@ -34,70 +34,6 @@ PHI_DECLARE_bool(gemm_use_half_precision_compute_type); namespace phi { namespace funcs { - -inline static cublasHandle_t blas_handle_ = nullptr; -inline static cublasHandle_t blas_tensor_core_handle_ = nullptr; -inline static cublasHandle_t blas_tf32_tensor_core_handle_ = nullptr; - -inline std::once_flag flag_sparse_; -inline std::once_flag flag_blas_; -inline std::once_flag flag_blaslt_; -inline std::once_flag flag_dnn_; -inline std::once_flag flag_solver_; -inline std::once_flag flag_cublas_; -inline std::once_flag flag_tensorcore_cublas_; -inline std::once_flag flag_eigen_device_; - -inline std::mutex blas_mtx_; -inline std::mutex blas_tensor_core_mtx_; -inline std::mutex blas_tf32_mtx_; -inline std::mutex sparse_mtx_; -inline std::mutex stream_call_back_mtx_; - -inline void InitBlasHandle(cublasHandle_t *blas_handle, gpuStream_t stream) { - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasSetStream(*blas_handle, stream)); -} - -inline void CublasCall(const std::function &callback, - phi::stream::stream_t stream) { - std::call_once(flag_cublas_, [&]() { - if (!blas_handle_) InitBlasHandle(&blas_handle_, stream); - if (!blas_tensor_core_handle_) { - InitBlasHandle(&blas_tensor_core_handle_, stream); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } - }); - std::lock_guard guard(blas_mtx_); - callback(blas_handle_); -} - -inline bool MetaxTensorCoreAvailable() { - return blas_tensor_core_handle_ != nullptr; -} - -inline void TensorCoreCublasCallIfAvailable( - const std::function &callback, - phi::stream::stream_t stream) { - std::call_once(flag_tensorcore_cublas_, [&]() { - if (!blas_handle_) InitBlasHandle(&blas_handle_, stream); - if (!blas_tensor_core_handle_) { - InitBlasHandle(&blas_tensor_core_handle_, stream); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } - }); - if (blas_tensor_core_handle_ != nullptr) { - std::lock_guard guard(blas_tensor_core_mtx_); - callback(blas_tensor_core_handle_); - } else { - std::lock_guard guard(blas_mtx_); - 
callback(blas_handle_); - } -} - template struct CUBlas; @@ -174,28 +110,26 @@ struct CUBlas { // here. #if CUDA_VERSION >= 8000 VLOG(5) << "use_tensor_op_math: " - << (MetaxTensorCoreAvailable() ? "True" : "False"); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc)); - }, - dev_ctx->stream()); + << (dev_ctx->tensor_core_available() ? "True" : "False"); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasSgemmEx is not supported on cuda <= 7.5")); @@ -376,7 +310,7 @@ struct CUBlas { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -386,31 +320,29 @@ struct CUBlas { thrust::device_vector A_ptr(A, A + batchCount); thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A_ptr.data().get(), - Atype, - lda, - B_ptr.data().get(), - Btype, - ldb, - beta, - C_ptr.data().get(), - Ctype, - ldc, - batchCount, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmBatchedEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A_ptr.data().get(), + Atype, + lda, + B_ptr.data().get(), + Btype, + ldb, + beta, + C_ptr.data().get(), + Ctype, + ldc, + batchCount, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmBatchedEx is not supported on cuda <= 7.5")); @@ -486,7 +418,7 @@ struct CUBlas { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -494,29 +426,27 @@ struct CUBlas { << (use_tensor_op_math ? 
"True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -696,7 +626,7 @@ struct CUBlas> { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -704,29 +634,27 @@ struct CUBlas> { << (use_tensor_op_math ? "True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -1024,7 +952,7 @@ struct CUBlas> { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1032,29 +960,27 @@ struct CUBlas> { << (use_tensor_op_math ? 
"True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -1186,24 +1112,22 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, PADDLE_THROW(common::errors::Unimplemented( "GEMM_EX_64 is not supported on cuda < 12.3")); } else { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - N); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + N); + }); } #if CUDA_VERSION >= 8000 @@ -1271,24 +1195,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - h_B, - ldb, - h_A, - lda, - &h_beta, - h_C, - N); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + h_B, + ldb, + h_A, + lda, + &h_beta, + h_C, + N); + }); #endif // CUDA_VERSION >= 8000 } @@ -1352,24 +1274,22 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, PADDLE_THROW(common::errors::Unimplemented( "GEMM_EX_64 is not supported on cuda < 12.3")); } else { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &t_alpha, - B, - static_cast(ldb), - A, - static_cast(lda), - &t_beta, - C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &t_alpha, + B, + static_cast(ldb), + A, + static_cast(lda), + &t_beta, + C, + static_cast(N)); + }); } #if CUDA_VERSION >= 8000 @@ -1447,24 +1367,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, CUBLAS_COMPUTE_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &h_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &h_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1503,7 +1421,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = 
dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1519,30 +1437,27 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 12030 } else { CheckGEMMNSize(N); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - N, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + N, + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -1621,24 +1536,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &c_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &c_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1713,24 +1626,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &c_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &c_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1769,7 +1680,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, float h_beta = beta; cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1784,30 +1695,28 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 12030 } else { CheckGEMMNSize(N); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - A, - CUDA_R_16BF, - static_cast(lda), - &h_beta, - C, - CUDA_R_16BF, - static_cast(N), - CUDA_R_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + A, + CUDA_R_16BF, + static_cast(lda), + &h_beta, + C, + CUDA_R_16BF, + 
static_cast(N), + CUDA_R_32F, + algo)); + }); } #else // raise error @@ -1860,24 +1769,22 @@ void Blas::GEMM(bool transA, } else { #endif // CUDA_VERSION >= 8000 - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + ldc); + }); #if CUDA_VERSION >= 8000 } @@ -1904,24 +1811,22 @@ inline void Blas::GEMM(bool transA, cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + ldc); + }); } template <> @@ -1957,36 +1862,33 @@ inline void Blas::GEMM(bool transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - ldc, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + ldc, + CUBLAS_COMPUTE_32F, + algo)); + }); #else // raise error PADDLE_THROW(phi::errors::Unimplemented( @@ -1998,27 +1900,23 @@ inline void Blas::GEMM(bool transA, template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); } template <> template void Blas::SCAL(int n, const T alpha, T *x) const { - CublasCall( - [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }, - dev_ctx_.stream()); + dev_ctx_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); } template <> template void Blas::VCOPY(int n, const T *x, T *y) const { - CublasCall( - [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }, - dev_ctx_.stream()); + dev_ctx_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); } template <> @@ -2033,12 +1931,9 @@ void Blas::GEMV(bool trans_a, T *C) const { cublasOperation_t cuTransA = !trans_a ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMV( - handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); } template <> @@ -2112,7 +2007,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || std::is_same::value) { cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2153,60 +2048,56 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - a, - B, - fp, - ldb, - strideB, - A, - fp, - lda, - strideA, - b, - C, - fp, - ldc, - strideC, - batchCount, - compute_type, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + a, + B, + fp, + ldb, + strideB, + A, + fp, + lda, + strideA, + b, + C, + fp, + ldc, + strideC, + batchCount, + compute_type, + algo)); + }); } } else { #endif // CUDA_VERSION >= 9010 - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &beta, - C, - ldc, - strideC, - static_cast(batchCount)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &alpha, + B, + static_cast(ldb), + strideB, + A, + static_cast(lda), + strideA, + &beta, + C, + ldc, + strideC, + static_cast(batchCount)); + }); #if CUDA_VERSION >= 9010 } @@ -2242,7 +2133,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || std::is_same::value) { cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2284,61 +2175,57 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - a, - B, - fp, - static_cast(ldb), - strideB, - A, - fp, - static_cast(lda), - strideA, - b, - C, - fp, - static_cast(ldc), - strideC, - static_cast(batchCount), - compute_type, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( + handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + a, + B, + fp, + static_cast(ldb), 
+ strideB, + A, + fp, + static_cast(lda), + strideA, + b, + C, + fp, + static_cast(ldc), + strideC, + static_cast(batchCount), + compute_type, + algo)); + }); } } else { #endif // CUDA_VERSION >= 9010 T h_alpha = static_cast(alpha); T h_beta = static_cast(beta); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &h_beta, - C, - static_cast(ldc), - strideC, - static_cast(batchCount)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + static_cast(ldb), + strideB, + A, + static_cast(lda), + strideA, + &h_beta, + C, + static_cast(ldc), + strideC, + static_cast(batchCount)); + }); #if CUDA_VERSION >= 9010 } @@ -2377,7 +2264,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2392,34 +2279,32 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), - strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + strideB, + A, + CUDA_R_16BF, + static_cast(lda), + strideA, + &h_beta, + C, + CUDA_R_16BF, + static_cast(ldc), + strideC, + static_cast(batchCount), + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -2460,7 +2345,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float h_beta = beta; cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2475,34 +2360,32 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), - strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + 
PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + strideB, + A, + CUDA_R_16BF, + static_cast(lda), + strideA, + &h_beta, + C, + CUDA_R_16BF, + static_cast(ldc), + strideC, + static_cast(batchCount), + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -2547,7 +2430,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // (std::is_same::value)) || // std::is_same::value) { // cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = MetaxTensorCoreAvailable(); +// bool use_tensor_op_math = dev_ctx_.tensor_core_available(); // if (use_tensor_op_math) { // algo = CUBLAS_GEMM_DFALT_TENSOR_OP; // } @@ -2579,7 +2462,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // #endif // } -// TensorCoreCublasCallIfAvailable( +// dev_ctx_.TensorCoreCublasCallIfAvailable( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasGemmStridedBatchedEx(handle, @@ -2605,12 +2488,11 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // batchCount, // compute_type, // algo)); -// }, -// dev_ctx_.stream()); +// }); // } else { // #endif // CUDA_VERSION >= 9010 -// CublasCall( +// dev_ctx_.CublasCall( // [&](cublasHandle_t handle) { // CUBlas::GEMM_STRIDED_BATCH(handle, // cuTransB, @@ -2667,7 +2549,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // cublasOperation_t cuTransB = // (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; // const int64_t strideC = M * N; -// CublasCall( +// dev_ctx_.CublasCall( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasDgemmStridedBatched(handle, @@ -2723,14 +2605,14 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // float h_beta = static_cast(beta); // cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = MetaxTensorCoreAvailable(); +// bool use_tensor_op_math = dev_ctx->tensor_core_available(); // if (use_tensor_op_math) { // algo = CUBLAS_GEMM_DFALT_TENSOR_OP; // } // VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : // "False"); -// TensorCoreCublasCallIfAvailable( +// dev_ctx_.TensorCoreCublasCallIfAvailable( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasGemmStridedBatchedEx(handle, @@ -2756,8 +2638,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // batchCount, // CUBLAS_COMPUTE_32F, // algo)); -// }, -// dev_ctx_.stream()); +// }); // #else // // raise error // PADDLE_THROW(phi::errors::Unimplemented( @@ -2812,25 +2693,23 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); } template <> @@ -2859,25 +2738,23 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); } template <> @@ -2970,7 +2847,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float f_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2979,31 +2856,29 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector A_ptr(A, A + batchCount); thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &f_alpha, - B_ptr.data().get(), - CUDA_R_16BF, - ldb, - A_ptr.data().get(), - CUDA_R_16BF, - lda, - &f_beta, - C_ptr.data().get(), - CUDA_R_16BF, - ldc, - batchCount, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &f_alpha, + B_ptr.data().get(), + CUDA_R_16BF, + ldb, + A_ptr.data().get(), + CUDA_R_16BF, + lda, + &f_beta, + C_ptr.data().get(), + CUDA_R_16BF, + ldc, + batchCount, + CUBLAS_COMPUTE_32F, + algo)); + }); #else // raise error PADDLE_THROW(phi::errors::Unimplemented( @@ -3038,33 +2913,19 @@ void Blas::TRSM(CBLAS_SIDE side, cublasDiagType_t cuDiag = (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::TRSM(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM( + handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb); + }); } template <> template void Blas::BatchedGETRF( int n, T **a, int *ipiv, int *info, int batch_size) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); + }); } template <> @@ -3084,23 +2945,18 @@ void Blas::BatchedGETRI(int n, "overlap memory space of input matrix (address: %p).", a_inv, a)); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRI_BATCH( - handle, n, a, n, ipiv, a_inv, n, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); + }); } template <> template void Blas::BatchedMatInv( int n, const T **a, T **a_inv, int *info, int batch_size) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); + }); } template <> @@ -3118,12 +2974,10 @@ void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, // use CUBLAS_OP_C (conjugate transpose) for complex cublasOperation_t cuTrans = (trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRS_BATCH( - handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRS_BATCH( + handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); + }); } template <> @@ -3152,23 +3006,21 @@ void Blas::BatchedTRSM(CBLAS_SIDE side, cublasDiagType_t cuDiag = (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::TRSM_BATCH(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb, - batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM_BATCH(handle, + cuSide, + cuUplo, + cuTransA, + cuDiag, + N, + M, + &alpha, + A, + lda, + B, + ldb, + batch_size); + }); } } // namespace funcs diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_adam_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_adam_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_adam_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_adam_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_embedding_eltwise_layernorm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_embedding_eltwise_layernorm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_embedding_eltwise_layernorm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_embedding_eltwise_layernorm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_layernorm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_layernorm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_grad_kernel_register.cu 
diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_upper_triangle_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_upper_triangle_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_upper_triangle_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_upper_triangle_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_stack_transpose_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_stack_transpose_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_stack_transpose_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_stack_transpose_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu new file mode 100644 index 00000000000..08876233bfb --- /dev/null +++ b/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
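+// The registration below pulls in Paddle's CUDA FusedSwigluWeightedBwdKernel +// by including the upstream .cu source, and pins the output dtypes through +// OutputAt(): outputs 0 and 2 are forced to BFLOAT16 and output 1 to FLOAT32, +// independent of the registered input dtypes.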
+ +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/fusion/gpu/fused_swiglu_weighted_bwd_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_swiglu_weighted_bwd, + metax_gpu, + ALL_LAYOUT, + phi::FusedSwigluWeightedBwdKernel, + float, + double, + int, + int64_t, + phi::bfloat16) { + kernel->OutputAt(0).SetDataType(phi::DataType::BFLOAT16); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::BFLOAT16); +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_token_prune_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_token_prune_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_token_prune_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_token_prune_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_transpose_split_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_transpose_split_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_transpose_split_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_transpose_split_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_transpose_wlch_split_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_transpose_wlch_split_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_transpose_wlch_split_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_transpose_wlch_split_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index 62aaa5fb2de..a388387de45 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -15,25 +15,6 @@ #include "kernels/metax_kernel/metax_context.h" namespace phi { -const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return false; -}(); - -const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; -}(); - -bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } - void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, @@ -87,20 +68,4 @@ static void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { phi::dynload::hipblasLtCreate(blaslt_handle); #endif } - -blasLtHandle_t GetBlasLtHandle() { - std::call_once(flag_blaslt_, [&]() { - if (!blaslt_handle_) { - if (!blaslt_handle_creator_) - InitBlasLtHandle(&blaslt_handle_); - else - blaslt_handle_ = blaslt_handle_creator_(); - } - }); - PADDLE_ENFORCE_NOT_NULL( - blaslt_handle_, - common::errors::InvalidArgument( - "The GPU blasLt handle is nullptr. 
It must not be null.")); - return blaslt_handle_; -} } // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index a6610c1dab2..2339e18a4a6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -128,8 +128,6 @@ inline void InitCusolverDnHandle(cusolverDnHandle_t* handle, } } -bool AllowTF32Cublas(); -bool AllowTF32Cudnn(); inline cusolverDnHandle_t GetCusolverDnHandle(gpuStream_t stream, Place place) { std::call_once(flag_cusolver_dn_, [&]() { if (!cusolver_dn_handle_) { From 1af5148d20ce28e202fb0ac672f266c807d98b17 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Fri, 19 Sep 2025 18:31:14 +0800 Subject: [PATCH 36/58] [Metax] add log analysis script (#46) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script --- .../metax_gpu/tests/scripts/classify.json | 22 ++ .../metax_gpu/tests/scripts/log_analysis.py | 216 ++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 backends/metax_gpu/tests/scripts/classify.json create mode 100644 backends/metax_gpu/tests/scripts/log_analysis.py diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json new file mode 100644 index 00000000000..b97255adc3d --- /dev/null +++ b/backends/metax_gpu/tests/scripts/classify.json @@ -0,0 +1,22 @@ +{ + "OK":{ + "skipped":{ + "rule":["skipped="] + } + }, + + "FAILED":{ + "precision":{ + "rule":["Mismatched elements"] + }, + "api":{ + "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"] + }, + "missing":{ + "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"] + }, + "file_not_found":{ + "rule":["FileNotFoundError:"] + } + } +} diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py new file mode 100644 index 00000000000..c0716f5b6f5 --- /dev/null +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -0,0 +1,216 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
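+# Buckets unit-test logs using the keyword rules from classify.json: each +# file matching the pattern is scanned line by line against the FAILED/OK +# rules and the test name is appended to the matching category. Minimal +# usage sketch (paths are illustrative): +#   analyzer = LogAnalyzer("./classify.json", "./logs", pattern="test_*.log") +#   analyzer.run(); analyzer.show_result(); analyzer.save_result("./output")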
+ +import json +import os +import fnmatch +import shutil +from enum import Enum + + +class TestResult(Enum): + OK = "OK" + FAILURE = "FAILED" + + +class LogAnalyzer: + def __init__( + self, + classify_file: str, + search_path: str, + pattern: str = None, + encoding: str = "utf-8", + ): + self.__pattern = pattern + self.__search_path = search_path + self.__encoding = encoding + self.__statistical_data = {} + + self.__classify_data = self.__read_json_file(classify_file) + for key, value in self.__classify_data.items(): + self.__statistical_data[key] = {} + for sub_key in list(value.keys()): + self.__statistical_data[key][sub_key] = [] + + self.__statistical_data[TestResult.OK.value]["noskip"] = [] + self.__statistical_data[TestResult.FAILURE.value]["other"] = [] + + def __read_json_file(self, path: str) -> dict: + with open(path, "r", encoding=self.__encoding) as f: + data = json.load(f) + f.close() + return data + + def __check_path(self, path: str) -> None: + """ + Prepare the given path: + - directory path: create it if missing, otherwise clear its contents + - file path: create it if missing, otherwise clear its contents + """ + try: + # Check whether the path exists + if os.path.exists(path): + # The path exists; decide whether it is a file or a directory + if os.path.isfile(path): + # A file: truncate its contents + with open(path, "w", encoding="utf-8") as f: + f.write("")  # write empty content to clear the file + # print(f"File already exists, contents cleared: {path}") + + elif os.path.isdir(path): + # A directory: remove everything inside it + for item in os.listdir(path): + item_path = os.path.join(path, item) + if os.path.isfile(item_path) or os.path.islink(item_path): + os.remove(item_path)  # remove the file or symlink + elif os.path.isdir(item_path): + shutil.rmtree(item_path)  # recursively remove the subdirectory + # print(f"Directory already exists, contents cleared: {path}") + else: + # The path does not exist; infer the target type + # (from whether the last component has an extension) + last_part = os.path.basename(path) + + # Treat it as a file path if it contains an extension + if "." in last_part and not last_part.endswith("."): + # Create the file (including parent directories) + parent_dir = os.path.dirname(path) + if parent_dir and not os.path.exists(parent_dir): + os.makedirs(parent_dir, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + pass  # create an empty file + # print(f"File did not exist, created: {path}") + + else: + # Create the directory (multi-level paths supported) + os.makedirs(path, exist_ok=True) + # print(f"Directory did not exist, created: {path}") + + except PermissionError: + print(f"Permission error: cannot operate on path {path}") + except Exception as e: + print(f"Error while handling path: {str(e)}") + + def save_result(self, dir_path: str = "./") -> None: + """ + Write each category's op names into a per-category text file under + dir_path, creating the target directory if missing and clearing it + if it already exists. + """ + + for key, value in self.__statistical_data.items(): + sub_dir = os.path.join(dir_path, key) + self.__check_path(sub_dir) + + for sub_key, sub_value in value.items(): + # print(f"{sub_key}: {len(value[sub_key])} - ({sub_value})") + try: + with open( + os.path.join(sub_dir, sub_key) + ".txt", "w", encoding="utf-8" + ) as f: + for op_name in sub_value: + if not op_name.endswith("\n"): + op_name += "\n" + f.write(op_name) + # print(f"Content successfully written to {file_path}") + except Exception as e: + print(f"Failed to write file: {e}") + + def show_result(self) -> None: + test_counts = 0 + for key, value in self.__statistical_data.items(): + print(f"\n---------- {key} ----------") + for sub_key, sub_value in value.items(): + test_counts = test_counts + len(value[sub_key]) + print(f"{sub_key}: {len(value[sub_key])}\n\t{sub_value}\n") + print( + f"\n******************* Total log num: {test_counts} *******************\n\n" + ) + + def run(self): + """ + Read the files under the search path whose names match the pattern + and walk through each one line by line. + + Args: + search_path: root directory to search + pattern: filename pattern (wildcards supported, e.g. '*.txt', 'file_*.log') + """ + for dirpath, dirnames, filenames in os.walk(self.__search_path): + for filename in fnmatch.filter(filenames, self.__pattern): + file_path = os.path.join(dirpath,
filename) + # print(f"\n===== Processing file: {file_path} =====") + + cur_res_type = TestResult.FAILURE + cur_sub_type = "other" + pre_line = None + finish_early = False + + try: + with open(file_path, "r", encoding=self.__encoding) as f: + for line in f: + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for keyword in sub_type_params["rule"]: + if keyword in line: + cur_sub_type = sub_type + if sub_type == "missing": + finish_early = True + break + + if finish_early: + break + + pre_line = line + if finish_early: + break + + if "OK" in pre_line: + cur_res_type = TestResult.OK + cur_sub_type = None + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for rule in sub_type_params["rule"]: + if rule in line: + cur_sub_type = sub_type + + op_name = filename.split(".") + if cur_sub_type is None: + self.__statistical_data[cur_res_type.value][ + "noskip" + ].append(op_name[0]) + else: + self.__statistical_data[cur_res_type.value][ + cur_sub_type + ].append(op_name[0]) + # print(f"Result: {cur_res_type.value}, type: {cur_sub_type}") + f.close() + except UnicodeDecodeError: + print(f"Warning: file {file_path} is not utf-8 encoded, skipping it") + except Exception as e: + print(f"Error while processing file {file_path}: {str(e)}") + + +if __name__ == "__main__": + + analyzer = LogAnalyzer( + classify_file="./classify.json", + search_path="./NPU_logs/20250918_065326", + pattern="test_*.log", + ) + + analyzer.run() + analyzer.show_result() + analyzer.save_result("./output") From 518bee8382cdb7879f38e8b81e719aa8853b825e Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Fri, 19 Sep 2025 19:07:47 +0800 Subject: [PATCH 37/58] add_generate_pb (#47) * add_generate_pb --------- --- backends/metax_gpu/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 7b8c52f1f31..78b4c9c566b 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -70,6 +70,7 @@ include(eigen) include(xxhash) include(zlib) include(protobuf) +include(generate_pb) set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto") get_filename_component(PROTO_WE "${PROTO_FILE}" NAME_WE) From bc02549e7450cffb6b6925ef199b6f6fcbd63259 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 22 Sep 2025 16:44:28 +0800 Subject: [PATCH 38/58] modify blas (#51) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas --- backends/metax_gpu/CMakeLists.txt | 1 + .../metax_gpu/kernels/metax_kernel/metax_context.cc | 12 ------------ .../metax_gpu/kernels/metax_kernel/metax_context.h | 4 +--- backends/metax_gpu/patch/paddle.patch | 1 - 4 files changed, 2 insertions(+), 16 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 78b4c9c566b..b98f2bcc919 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -733,6 +733,7 @@ target_compile_definitions( ${TARGET_NAME} PUBLIC PADDLE_WITH_CUDA=1 PADDLE_WITH_CUSTOM_DEVICE=1 + mcblasContext=cublasContext GPUContext=CustomContext KPSContext=CustomContext STREAM_TYPE=cudaStream_t diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index
a388387de45..6d86c81041f 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -56,16 +56,4 @@ void DnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) { allocation_.reset(); allocation_ = allocator_->Allocate(required_workspace_bytes); } - -static std::function blaslt_handle_creator_{nullptr}; -static blasLtHandle_t blaslt_handle_{nullptr}; -static std::once_flag flag_blaslt_; - -static void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 - mcblasLtCreate(blaslt_handle); -#elif defined(PADDLE_WITH_HIP) - phi::dynload::hipblasLtCreate(blaslt_handle); -#endif -} } // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 2339e18a4a6..376981f27a4 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -27,9 +27,7 @@ #include "paddle/phi/core/attribute.h" #include "paddle/phi/core/device_context.h" -using blasLtHandle_t = struct mcblasLtContext*; - -blasLtHandle_t GetBlasLtHandle(); +cublasLtHandle_t GetBlasLtHandle(); namespace phi { class DnnWorkspaceHandle { diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index b7bdb953077..beefb730bf7 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -488,7 +488,6 @@ index 4eae698648..5c047723ea 100644 #endif return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; } - diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h From 1977ca87be51518f59506d37c08790938e4c1345 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 22 Sep 2025 17:31:21 +0800 Subject: [PATCH 39/58] [metax] modify tf32 (#52) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas * modify context --- .../kernels/metax_kernel/metax_context.cc | 18 ++++++++++++++++++ .../kernels/metax_kernel/metax_context.h | 2 ++ 2 files changed, 20 insertions(+) diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index 6d86c81041f..efddba5f00b 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -15,6 +15,24 @@ #include "kernels/metax_kernel/metax_context.h" namespace phi { +const bool allow_tf32_cublas = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return true; +}(); + +const bool allow_tf32_cudnn = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; +}(); + +bool AllowTF32Cublas() { return allow_tf32_cublas; } +bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 376981f27a4..2d761439089 100644 --- 
a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -30,6 +30,8 @@ cublasLtHandle_t GetBlasLtHandle(); namespace phi { +bool AllowTF32Cublas(); +bool AllowTF32Cudnn(); class DnnWorkspaceHandle { public: inline DnnWorkspaceHandle(Allocator* allocator, gpuStream_t stream) From 1ae2618ac81e21e41b05797e08f1330eb504c4d5 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:46:50 +0800 Subject: [PATCH 40/58] [Metax] update metax backend CI test (#53) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test --- backends/metax_gpu/tests/CMakeLists.txt | 192 +++++++++++------------- backends/metax_gpu/tests/default.txt | 67 +++++++++ backends/metax_gpu/tests/run_test.sh | 56 ++++++- 3 files changed, 202 insertions(+), 113 deletions(-) create mode 100644 backends/metax_gpu/tests/default.txt diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 795a3c5b8ac..ded54233f24 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -11,117 +11,95 @@ set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") -list( - APPEND - PYTHON_TEST_SCRIPTS - ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_where_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_split_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fill_constant_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_empty_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_sign_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_unbind_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_put_along_axis_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_maximum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_strided_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_set_value_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_flatten_contiguous_range_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_top_k_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_subtract_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_greater_equal_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_top_k_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_one_hot_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fill_any_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_reshape_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_bitwise_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_pad_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_zeros_like_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_shape_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py - 
${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_bincount_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_assign_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fused_bias_act_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_adamw_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_nd_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_concat_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_nd_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_floordiv_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_mul_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_numel_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scale_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py) - -list( - REMOVE_ITEM - PYTHON_TEST_SCRIPTS - # precision issues - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - # core.cudnnversion - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # adaptation issue with the self._get_places() interface in op_test.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - # device == "gpu" adaptation issue - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - # same errors as paddle-gpu - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - # needs check_grad with fp64 precision - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - # CUDAPinnedPlace issue - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) +if(NOT TEST_LIST_FILE) + message( + STATUS + "<TEST_LIST_FILE> is not set, default test list [ ${CMAKE_CURRENT_LIST_DIR}/default.txt ] will be used."
+ ) + file(STRINGS ${CMAKE_CURRENT_LIST_DIR}/default.txt TEST_PROGRAMS) + +else() + if(NOT EXISTS ${TEST_LIST_FILE}) + message(FATAL_ERROR "<TEST_LIST_FILE> does not exist, please check it again.") + endif() + + file(STRINGS ${TEST_LIST_FILE} TEST_PROGRAMS) + + if(NOT TEST_PROGRAMS) + message(FATAL_ERROR "<TEST_LIST_FILE> is empty.") + endif() + + set(PYTHON_TEST_SCRIPTS "") +endif() + +foreach(test_name ${TEST_PROGRAMS}) + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) + message(WARNING "${CURRENT_TEST_PROGRAM} does not exist, skipping it.") + else() + list(APPEND PYTHON_TEST_SCRIPTS ${CURRENT_TEST_PROGRAM}) + endif() +endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) + +if(NOT TEST_LIST_FILE) + list( + REMOVE_ITEM + PYTHON_TEST_SCRIPTS + # precision issues + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + # core.cudnnversion + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + # adaptation issue with the self._get_places() interface in op_test.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + # device == "gpu" adaptation issue + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + # same errors as paddle-gpu + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + # paddle.device.cuda.get_device_properties + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + # needs check_grad with fp64 precision + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + # CUDAPinnedPlace issue + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) +endif() + +if(LOG_OUTPUT_DIR AND NOT EXISTS ${LOG_OUTPUT_DIR}) + file(MAKE_DIRECTORY ${LOG_OUTPUT_DIR}) + message(WARNING "${LOG_OUTPUT_DIR} does not exist, creating it now.") +endif() + foreach(test_script ${PYTHON_TEST_SCRIPTS}) get_filename_component(test_name ${test_script} NAME_WE) - add_test( - NAME "python_${test_name}" - COMMAND ${Python_EXECUTABLE} ${test_script} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + if(LOG_OUTPUT_DIR) + set(test_log_file "${LOG_OUTPUT_DIR}/${test_name}.log") + + add_test( + NAME "python_${test_name}" + COMMAND sh -c + "${Python_EXECUTABLE} ${test_script} > ${test_log_file} 2>&1" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + + else() + add_test( + NAME "python_${test_name}" + COMMAND ${Python_EXECUTABLE} ${test_script} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360) endforeach() diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt new file mode 100644 index 00000000000..8e2c3bcdd7e --- /dev/null +++ b/backends/metax_gpu/tests/default.txt @@ -0,0 +1,67 @@ +test_accuracy_op +test_tril_triu_op +test_where_op +test_split_op +test_fill_constant_op +test_empty_op +test_sign_op +test_cast_op +test_index_add_op +test_unbind_op +test_put_along_axis_op +test_layer_norm_op +test_maximum_op +test_accuracy_op +test_strided_slice_op +test_sum_op +test_set_value_op +test_flatten_contiguous_range_op +test_top_k_op +test_subtract_op +test_softmax_op +test_cumsum_op
+test_greater_equal_op +test_elementwise_div_op +test_top_k_v2_op +test_stack_op +test_one_hot_v2_op +test_fill_any_op +test_gather_op +test_reshape_op +test_index_put_op +test_bitwise_op +test_max_op +test_pad_op +test_elementwise_pow_op +test_uniform_random_op +test_scatter_op +test_cast_op +test_zeros_like_op +test_compare_op +test_shape_op +test_tril_triu_op +test_slice_op +test_elementwise_add_op +test_index_put_op +test_bincount_op +test_assign_op +test_logical_op +test_squared_l2_norm_op +test_mean_op +test_fused_bias_act_op +test_expand_v2_op +test_adamw_op +test_gather_nd_op +test_concat_op +test_scatter_nd_op +test_elementwise_floordiv_op +test_elementwise_mul_op +test_transpose_op +test_einsum_op +test_randint_op +test_c_embedding_op +test_numel_op +test_scale_op +test_softmax_with_cross_entropy_op +test_full_op +test_scatter_op diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index 7d1e8e072a9..b9e8ec5b5cc 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,10 +29,54 @@ export rm -r build mkdir -p build && cd build -cmake .. +TEST_LOG_LEVEL=0 +TEST_LIST_FILE="" +TEST_LOG_OUTPUT_DIR="" +TEST_PARALLEL_NUM=10 -cmake --build . +while getopts "i:o:v:j:h" opt; do + case "$opt" in + i) + TEST_LIST_FILE="$OPTARG" + ;; + o) + TEST_LOG_OUTPUT_DIR="$OPTARG" + echo "Set log output dir [ $TEST_LOG_OUTPUT_DIR ]" + ;; + v) + TEST_LOG_LEVEL=$OPTARG + ;; + j) + TEST_PARALLEL_NUM="$OPTARG" + ;; + h) + echo "Usage: $0 -i <test list file> -o <log output dir> ..." + echo "Options:" + echo "  -i  test program list file" + echo "  -o  log output directory" + echo "  -v  GLOG_v log level" + echo "  -j  number of parallel ctest jobs" + echo "  -h  show this help" + exit 0 + ;; + \?) + echo "error: unknown option '-$OPTARG'." + exit 1 + ;; + :) + echo "error: option '-$OPTARG' requires an argument." + exit 1 + ;; + esac +done + + +export GLOG_v=$TEST_LOG_LEVEL -ctest -j10 --output-on-failure +cmake .. -DTEST_LIST_FILE=$TEST_LIST_FILE -DLOG_OUTPUT_DIR=$TEST_LOG_OUTPUT_DIR + +cmake --build .
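+ +# Run the selected tests through ctest; parallelism comes from the -j flag +# (e.g. "bash run_test.sh -i default.txt -o ./logs -j 8" -- an illustrative +# invocation, not one taken from the CI config).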
+ +ctest -j$TEST_PARALLEL_NUM --output-on-failure From 76d5eb0245904cc209e52dd9fa92dea990db1ad7 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 23 Sep 2025 09:43:37 +0800 Subject: [PATCH 41/58] [Metax] fix log_analysis.py bug (#54) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test * [Metax] fix log_analysis.py bug --- .../metax_gpu/tests/scripts/log_analysis.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py index c0716f5b6f5..963d50751f7 100644 --- a/backends/metax_gpu/tests/scripts/log_analysis.py +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -153,7 +153,6 @@ def run(self): cur_res_type = TestResult.FAILURE cur_sub_type = "other" - pre_line = None finish_early = False @@ -172,19 +171,19 @@ def run(self): if finish_early: break - pre_line = line if finish_early: break - if "OK" in pre_line: - cur_res_type = TestResult.OK - cur_sub_type = None - for sub_type, sub_type_params in self.__classify_data[ - cur_res_type.value - ].items(): - for rule in sub_type_params["rule"]: - if rule in line: - cur_sub_type = sub_type + if len(line) >= 2 and line[:2] == "OK": + cur_res_type = TestResult.OK + cur_sub_type = None + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for rule in sub_type_params["rule"]: + if rule in line: + cur_sub_type = sub_type + break op_name = filename.split(".") From 9c17b6e0867119ea51c1c4230603f2a34137ac68 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:09:44 +0800 Subject: [PATCH 42/58] [Metax] update metax CI CMakeLists & scripts (#56) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test * [Metax] fix log_analysis.py bug * [Metax] update metax CI CMakeLists & scripts --- .github/workflows/metax_work.yaml | 2 +- backends/metax_gpu/tests/CMakeLists.txt | 4 ++- backends/metax_gpu/tests/run_test.sh | 2 +- .../metax_gpu/tests/scripts/classify.json | 31 +++++++++++++++++-- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 51c0c62cef6..aff530d475c 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -51,4 +51,4 @@ jobs: - name: run test run: | cd backends/metax_gpu/tests - bash run_test.sh + bash run_test.sh -j 16 diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index ded54233f24..5b7be15e4f9 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -47,6 +47,8 @@ if(NOT TEST_LIST_FILE) list( REMOVE_ITEM PYTHON_TEST_SCRIPTS + # Metax unit test + ${METAX_UNIT_TEST_PATH}/test_matmul_op__metax.py # precision issues ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py @@ -101,5 +103,5 @@ foreach(test_script ${PYTHON_TEST_SCRIPTS}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() - set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360) + set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 600)
endforeach() diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index b9e8ec5b5cc..7f2277fe4fb 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -33,7 +33,7 @@ mkdir -p build && cd build TEST_LOG_LEVEL=0 TEST_LIST_FILE="" TEST_LOG_OUTPUT_DIR="" -TEST_PARALLEL_NUM=10 +TEST_PARALLEL_NUM=1 while getopts "i:o:v:j:h" opt; do case "$opt" in diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json index b97255adc3d..ca92ad4a0a4 100644 --- a/backends/metax_gpu/tests/scripts/classify.json +++ b/backends/metax_gpu/tests/scripts/classify.json @@ -7,13 +7,38 @@ "FAILED":{ "precision":{ - "rule":["Mismatched elements"] + "rule":["Mismatched elements", + "RuntimeError: Jacobian mismatch for output 0 in y with respect to input 0 in x on Place(metax_gpu:0),", + "AssertionError: np.float64("] }, "api":{ - "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"] + "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", + "ValueError: The API paddle.device.cuda.get_device_properties", + "TypeError: paddle.index_add api", + "RuntimeError: (Unavailable) Paddle is not compiled with CUDA.", + "ValueError: invalid literal for int() with base", + "AttributeError: module 'paddle.base.libpaddle' has no attribute 'cudnn_version'", + "RuntimeError: Pinning memory is not supported for Place(metax_gpu:0)", + "PreconditionNotMetError: Context place error, excepted GPUPlace, but actually Place(metax_gpu:0).", + "AttributeError: module 'paddle.base.libpaddle.eager.ops.legacy' has no attribute 'fused_gemm_epilogue'", + "ValueError: The device should not be 'gpu', since PaddlePaddle is not compiled with CUDA"] }, "missing":{ - "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"] + "rule":["missing metax_gpu kernel", + "missing ONEDNN kernel", + "UnimplementedError: There are no kernels which are registered", + "symbol lookup error:", + "RuntimeError: (NotFound) The kernel"] + }, + "core_dumped":{ + "rule":["Segmentation fault"] + }, + "input_dim":{ + "rule":["ValueError: (InvalidArgument) The Input(", + "Test range of input is out of bound"] + }, + "array_dim":{ + "rule":["Arrays are not equal"] }, "file_not_found":{ "rule":["FileNotFoundError:"] From 51c98a20020ba61b2bfab54abf11668a9f40e0b6 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 23 Sep 2025 19:11:49 +0800 Subject: [PATCH 43/58] [Metax] fix MatmulKernel problem (#57) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test * [Metax] fix log_analysis.py bug * [Metax] update metax CI CMakeLists & scripts * [Metax] fix MatmulKernel problem * [Metax] update metax CI program --- .../kernels/impl/matmul_kernel_impl.h | 19 +- backends/metax_gpu/tests/CMakeLists.txt | 2 +- backends/metax_gpu/tests/default.txt | 258 ++++++++++++ ...r_equal.py => test_greater_equal_metax.py} | 0 ...ild_src_rank_and_local_expert_id_metax.py} | 0 ...cubate_expand_modality_expert_id_metax.py} | 0 ....py => test_incubate_moe_combine_metax.py} | 0 ...e_dispatch_partial_nosoftmaxtopk_metax.py} | 0 ..._moe_gate_dispatch_w_permute_bwd_metax.py} | 0 ...bate_moe_gate_dispatch_w_permute_metax.py} | 0 ...layer_norm.py => 
test_layer_norm_metax.py} | 0 ...l_op__metax.py => test_matmul_op_metax.py} | 0 ...mpling.py => test_top_p_sampling_metax.py} | 0 .../tests/unittest/test_matmul_op__metax.py | 395 ------------------ 14 files changed, 272 insertions(+), 402 deletions(-) rename backends/metax_gpu/tests/unit_test/{test_greater_equal.py => test_greater_equal_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_build_src_rank_and_local_expert_id.py => test_incubate_build_src_rank_and_local_expert_id_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_expand_modality_expert_id.py => test_incubate_expand_modality_expert_id_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_combine.py => test_incubate_moe_combine_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py => test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_w_permute_bwd.py => test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_w_permute.py => test_incubate_moe_gate_dispatch_w_permute_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_layer_norm.py => test_layer_norm_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_matmul_op__metax.py => test_matmul_op_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_top_p_sampling.py => test_top_p_sampling_metax.py} (100%) delete mode 100644 backends/metax_gpu/tests/unittest/test_matmul_op__metax.py diff --git a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h index bf228c81291..5221bd93ba9 100755 --- a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h @@ -40,6 +40,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 #include "paddle/phi/kernels/autotune/auto_tune_base.h" #endif +#include "paddle/phi/kernels/full_kernel.h" // clang-format on namespace phi { @@ -1485,16 +1486,22 @@ void MatmulKernel(const Context& ctx, bool transpose_x, bool transpose_y, DenseTensor* out) { - PADDLE_ENFORCE_NE( + if (x.numel() == 0 || y.numel() == 0) { + // input shape [1, 1, 5, 0], [1, 1, 0, 5], result shape is [1, 1, 5, 5] + phi::Full( + ctx, phi::IntArray(common::vectorize(out->dims())), 0, out); + return; + } + PADDLE_ENFORCE_GE( common::product(x.dims()), 0, - phi::errors::InvalidArgument("The Input(X) dims size must not be equal 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_NE( + common::errors::InvalidArgument( + "The dims of Input(X) should be greater than or equal to 0.")); + PADDLE_ENFORCE_GE( common::product(y.dims()), 0, - phi::errors::InvalidArgument("The Input(Y) dims size must not be equal 0," - " but reviced dims size is 0. 
")); + common::errors::InvalidArgument( + "The dims of Input(Y) should be greater than or equal to 0.")); const std::vector x_dims = common::vectorize(x.dims()); const std::vector y_dims = common::vectorize(y.dims()); MatmulJudgeDtypeKernel( diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 5b7be15e4f9..e8b11d347d9 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -48,7 +48,7 @@ if(NOT TEST_LIST_FILE) REMOVE_ITEM PYTHON_TEST_SCRIPTS # Metax unit test - ${METAX_UNIT_TEST_PATH}/test_matmul_op__metax.py + ${METAX_UNIT_TEST_PATH}/test_matmul_op_metax.py # 精度问题 ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 8e2c3bcdd7e..9f073d7e92f 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -65,3 +65,261 @@ test_scale_op test_softmax_with_cross_entropy_op test_full_op test_scatter_op +test_assign_pos_op +test_index_select_compatible +test_dequantize_abs_max_op +test_fill_any_op +test_fractional_max_pool3d_api +test_nll_loss +test_is_empty_op +test_norm_nn_grad +test_index_fill +test_floor +test_slice_scatter +test_nn_matmul_v2_grad +test_matmul_op_with_head +test_broadcast_shape +test_fill_constant_op +test_decayed_adagrad_op +test_count_nonzero_api +test_tensor_fill_ +test_minimum_op +test_sigmoid_focal_loss +test_dynamic_rnn_stop_gradient +test_ops_roi_align +test_split_op +test_sum_decorator +test_share_data_op +test_assert_op +test_masked_select_op +test_tensor_fill_diagonal_tensor_ +test_unfold_op +test_scatter_add_op +test_flatten_contiguous_range_op +test_empty_like_op +test_logsumexp +test_multiply +test_ceil_op +test_nearest_interp_v2_op +test_incubate_expand_modality_expert_id +test_bmm_op +test_prelu_op +test_batch_fc_op +test_masked_fill +test_overlap_add_op +test_update_loss_scaling_op +test_floor_divide_op +test_increment +test_complex_abs +test_gather_compatible +test_functional_conv2d +test_group_norm_op_v2 +test_conv2d_transpose_op_depthwise_conv +test_diagonal_op +test_maximum_op +test_erfinv_op +test_interp_recompute_scale_factor +test_embedding_scale_grad_by_freq +test_diagonal_scatter +test_higher_dim_scatter +test_infer_shape +test_flip +test_fused_bias_dropout_residual_layer_norm_op +test_greater_equal_op +test_add_op +test_cartesian_prod +test_uniform_random_inplace_op +test_feed_fetch_method +test_pow_op +test_conv3d_transpose_op +test_add_position_encoding_op +test_imperative_data_loader_base +test_rnn_cell_api +test_linspace +test_adaptive_log_softmax_with_loss +test_cross_entropy2_op +test_complex_reshape +test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk +test_gaussian_nll_loss +test_log_normal +test_unstack_op +test_expand_as_v2_op +test_dequantize_log_op +test_complex_sum_layer +test_slice_var +test_scale_op +test_hinge_embedding_loss +test_set_value_op +test_merged_adam_op +test_index_sample_op +test_cuda_empty_cache +test_add_n_op +test_randint_like +test_unique_consecutive_op +test_fill_diagonal_tensor_op +test_log_loss_op +test_linalg_cholesky_inverse +test_numel_op +test_tril_triu_op +test_adaptive_max_pool2d +test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad +test_complex_cast +test_poisson_nll_loss +test_empty_op +test_functional_conv1d_transpose +test_clip_by_norm_op +test_box_clip_op +test_clip_op +test_grad_clip_minimize +test_less_than_op +test_adamw_op +test_data_feeder 
+test_top_p_sampling +test_subtract_op +test_batch_norm_op_v2 +test_cosine_embedding_loss +test_imperative_data_parallel +test_sigmoid +test_adaptive_max_pool3d +test_roll_op +test_index_put_op +test_assign_op +test_amp_check_finite_and_scale_op +test_strided_slice_op +test_label_smooth_functional +test_c_softmax_with_cross_entropy_op +test_sync_batch_norm_op_convert +test_tensor_fill_diagonal_tensor +test_bfloat16_embedding +test_gelu_op +test_full_ +test_concat_op +test_imperative_data_loader_process +test_tensor_fill_diagonal_ +test_clip_grad_norm_ +test_eager_deletion_padding_rnn +test_pool2d_api +test_clip_grad_value_ +test_isfinite_v2_op +test_nn_sigmoid_op +test_adaptive_avg_pool2d +test_size +test_sigmoid_cross_entropy_with_logits_op +test_scatter_reduce_op +test_rsqrt +test_conv2d_transpose_layer +test_scatter_compatible +test_scatter_nd_op +test_add_op_fluid +test_unique +test_compat_split_static +test_stack_op +test_tile_op +test_adam_optimizer_fp32_fp64 +test_batch_norm_op +test_gather_nd_op +test_pow +test_executor_check_fetch_list +test_inplace_softmax_with_cross_entropy +test_cos +test_imperative_parallel_coalesce_split +test_grid_sample_function +test_rnn_decode_api +test_triu_indices_op +test_binary_cross_entropy_with_logits_op +test_mean_op_v1 +test_round_op +test_assign_pos_op_dygraph +test_nn_functional_embedding_static +test_norm_op +test_unbind_op +test_bilinear_interp_v2_op +test_tensor_data_ptr +test_norm_all +test_conv1d_transpose_layer +test_arange +test_compat_unfold +test_fetch_var +test_index_select_op +test_sign_op +test_functional_conv3d_transpose +test_uniform_random_bf16_op +test_gather_tree_op +test_histogram_bin_edges_op +test_fractional_max_pool2d_api +test_fill_any_like_op +test_alpha_dropout +test_conv3d_layer +test_compat_pad +test_box_coder_op +test_full_op +test_repeat_interleave_op +test_reshape_op +test_embedding_renorm +test_log_softmax +test_pad3d_op +test_diag_v2 +test_complex_transpose +test_prior_box_op +test_square_error_cost +test_fused_rotary_position_embedding +test_gru_rnn_op +test_restrict_nonzero +test_dygraph_weight_norm +test_conv_transpose_nn_grad +test_incubate_build_src_rank_and_local_expert_id +test_elementwise_nn_grad +test_fused_bias_dropout_residual_layer_norm_op_api +test_simple_rnn_op +test_data_generator +test_compat_split +test_scatter_add_inplace_op +test_c_softmax_with_multi_label_cross_entropy_op +test_conv3d_transpose_layer +test_less_equal_op +test_gumbel_softmax_op +test_assign_value_op +test_cast_op +test_fused_bias_act_op +test_conv3d_transpose_part2_op +test_log +test_data +test_incubate_moe_combine +test_masked_scatter +test_silu_op +test_select_scatter_op +test_adagrad_op_v2 +test_functional_conv3d +test_bce_with_logits_loss +test_argsort_op +test_layer_norm_op_v2 +test_adaptive_max_pool1d +test_shard_index_op +test_cuda_max_memory_allocated +test_roi_align_op +test_sin +test_take +test_take_along_dim +test_complex_matmul +test_reduce_as_op +test_log_normal_inplace +test_repeat +test_fetch_lod_tensor_array +test_partial_concat_op +test_accuracy_op +test_l1_norm_op +test_bce_loss +test_fused_conv2d_add_act_op +test_tril_indices_op +test_cross_entropy_op +test_blha_get_max_len_op +test_softmax_mask_fuse_op +test_diag_embed +test_one_hot_v2_op +test_selu_op +test_huber_loss_op +test_einsum_op +test_dygraph_spectral_norm +test_block_diag +test_index_elementwise +test_matmul_out diff --git a/backends/metax_gpu/tests/unit_test/test_greater_equal.py b/backends/metax_gpu/tests/unit_test/test_greater_equal_metax.py 
similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_greater_equal.py rename to backends/metax_gpu/tests/unit_test/test_greater_equal_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py rename to backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py rename to backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_combine_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_layer_norm.py b/backends/metax_gpu/tests/unit_test/test_layer_norm_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_layer_norm.py rename to backends/metax_gpu/tests/unit_test/test_layer_norm_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py b/backends/metax_gpu/tests/unit_test/test_matmul_op_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py rename to backends/metax_gpu/tests/unit_test/test_matmul_op_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py b/backends/metax_gpu/tests/unit_test/test_top_p_sampling_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_top_p_sampling.py rename to backends/metax_gpu/tests/unit_test/test_top_p_sampling_metax.py diff --git a/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py b/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py 
deleted file mode 100644 index 7545e16d14d..00000000000 --- a/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py +++ /dev/null @@ -1,395 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy as np -import unittest -from tests.op_test import OpTest -import paddle - -paddle.enable_static() -SEED = 2022 - - -def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): - """Reference forward implementation using np.matmul.""" - # np.matmul does not support the transpose flags, so we manually - # transpose X and Y appropriately. - if transpose_X: - if X.ndim == 1: - X = X.reshape((X.size,)) - elif X.ndim == 2: - X = X.T - else: - dim = [i for i in range(len(X.shape))] - dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] - X = np.transpose(X, tuple(dim)) - if transpose_Y: - if Y.ndim == 1: - Y = Y.reshape((Y.size,)) - else: - dim = [i for i in range(len(Y.shape))] - dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] - Y = np.transpose(Y, tuple(dim)) - - Out = np.matmul(X, Y) - if abs(scale - 1.0) > 1e-09: - Out = Out * scale - return Out - - -class TestBmmOp(OpTest): - """ - case 0 - """ - - def set_metax_gpu(self): - self.__class__.use_custom_device = True - self.place = paddle.CustomPlace("metax_gpu", 0) - - def config(self): - self.x_shape = (10, 2, 5) - self.y_shape = (10, 5, 8) - - def init_kernel_type(self): - self.dtype = "float32" - - def setUp(self): - self.set_metax_gpu() - self.init_kernel_type() - self.config() - self.op_type = "bmm" - x = np.random.random(self.x_shape).astype(self.dtype) - y = np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - x = -0.1 + 0.2 * x - y = -0.1 + 0.2 * y - result = reference_matmul(x, y) - result = result.astype(self.dtype) - self.inputs = { - "X": x, - "Y": y, - } - self.outputs = {"Out": result} - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestBmmOp1(TestBmmOp): - """ - case 1 - """ - - def config(self): - self.x_shape = (40, 10, 10) - self.y_shape = (40, 10, 10) - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestBmmOp2(TestBmmOp): - """ - case 2 - """ - - def config(self): - self.x_shape = (4, 10, 80) - self.y_shape = (4, 80, 1) - - def test_check_grad(self): - self.check_grad_with_place( - self.place, - ["X", "Y"], - "Out", - max_relative_error=1e-2, - ) - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - -class TestMatMulOp(OpTest): - """ - basic case - """ - - def setUp(self): - self.set_metax_gpu() - self.op_type = "matmul_v2" - self.init_dtype() - self.init_alpha() - self.config() - - X = np.random.random(self.x_shape).astype(self.dtype) - Y = 
np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - X = -0.1 + 0.2 * X - Y = -0.1 + 0.2 * Y - Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, self.alpha) - Out = Out.astype(self.dtype) - self.inputs = {"X": X, "Y": Y} - self.attrs = { - "trans_x": self.transpose_X, - "trans_y": self.transpose_Y, - "alpha": self.alpha, - } - self.outputs = {"Out": Out} - - def set_metax_gpu(self): - self.__class__.use_custom_device = True - self.place = paddle.CustomPlace("metax_gpu", 0) - - def config(self): - self.x_shape = (100,) - self.y_shape = (100,) - self.transpose_X = False - self.transpose_Y = False - - def init_alpha(self): - self.alpha = 1.0 - - def init_dtype(self): - self.dtype = "float32" - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-7) - - def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestMatMulOp1(TestMatMulOp): - """ - case x_ndim == 1, y_ndim != 1 - """ - - def config(self): - self.x_shape = (100,) - self.y_shape = (1, 3, 2, 100) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp2(TestMatMulOp): - """ - case x_ndim != 1, y_ndim == 1 - """ - - def config(self): - self.x_shape = (1, 2, 100, 1) - self.y_shape = (100,) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp3(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (2, 100) - self.y_shape = (100, 2) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp4(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (2, 100) - self.y_shape = (2, 100) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp5(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (100, 2) - self.y_shape = (100, 2) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp6(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 2, 25) - self.y_shape = (25, 4) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp7(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (1, 4, 25) - self.y_shape = (4, 25) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp8(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (1, 25, 4) - self.y_shape = (25, 4) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp9(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 5, 10) - self.y_shape = (2, 10, 5) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp10(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 10, 5) - self.y_shape = (2, 10, 5) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp11(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 5, 10) - self.y_shape = (2, 5, 10) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp12(TestMatMulOp): - """ - case to check the gradient for special case - """ - - def config(self): - self.x_shape = 100 - self.y_shape = (1, 2, 2, 100, 2) - self.transpose_X = False - self.transpose_Y = False - - -class 
TestMatMulOp13(TestMatMulOp): - """ - case to check the gradient for special case - """ - - def config(self): - self.x_shape = (2, 1, 100) - self.y_shape = 100 - self.transpose_X = False - self.transpose_Y = False - - -# TODO(metax_gpu): alpha will be supported in next version -# --------------------test matmul alpha-------------------- -# def create_test_alpha_class(parent): -# class TestMatMulOpAlphaCase(parent): -# def init_alpha(self): -# self.alpha = 0.125 - -# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") -# TestMatMulOpAlphaCase.__name__ = cls_name -# globals()[cls_name] = TestMatMulOpAlphaCase - -# create_test_alpha_class(TestMatMulOp) -# create_test_alpha_class(TestMatMulOp1) -# create_test_alpha_class(TestMatMulOp2) -# create_test_alpha_class(TestMatMulOp3) -# create_test_alpha_class(TestMatMulOp4) -# create_test_alpha_class(TestMatMulOp5) -# create_test_alpha_class(TestMatMulOp6) -# create_test_alpha_class(TestMatMulOp9) -# create_test_alpha_class(TestMatMulOp10) -# create_test_alpha_class(TestMatMulOp11) -# create_test_alpha_class(TestMatMulOp12) -# create_test_alpha_class(TestMatMulOp13) - - -# --------------------test matmul fp16-------------------- -def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): - class TestMatMulOpFp16Case(parent): - def init_kernel_type(self): - self.dtype = np.float16 - - def test_check_output(self): - self.check_output_with_place(self.place, atol=atol) - - def test_check_grad(self): - self.check_grad_with_place( - self.place, ["X", "Y"], "Out", max_relative_error=max_relative_error - ) - - cls_name = "{0}_{1}".format(parent.__name__, "Fp16") - TestMatMulOpFp16Case.__name__ = cls_name - globals()[cls_name] = TestMatMulOpFp16Case - - -create_test_fp16_class(TestMatMulOp) -create_test_fp16_class(TestMatMulOp1) -create_test_fp16_class(TestMatMulOp2) -create_test_fp16_class(TestMatMulOp3) -create_test_fp16_class(TestMatMulOp4) -create_test_fp16_class(TestMatMulOp5) -create_test_fp16_class(TestMatMulOp6) -create_test_fp16_class(TestMatMulOp9) -create_test_fp16_class(TestMatMulOp10) -create_test_fp16_class(TestMatMulOp11) -create_test_fp16_class(TestMatMulOp12) -create_test_fp16_class(TestMatMulOp13) - -if __name__ == "__main__": - unittest.main() From d113018e9befab1540aa21ee5d6f8261831e245d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 23 Sep 2025 19:12:06 +0800 Subject: [PATCH 44/58] [metax] fix paddle bug (#58) * [metax] fix paddle bug --- backends/metax_gpu/CMakeLists.txt | 2 - .../grid_sample_grad_kernel_register.cu | 23 - .../grid_sample_kernel_register.cu | 19 - .../grid_sample_grad_kernel_register.cu | 839 ++++++++++++++++++ .../grid_sample_kernel_register.cu | 527 +++++++++++ .../metax_kernel/weight_only_linear_kernel.cu | 3 +- 6 files changed, 1368 insertions(+), 45 deletions(-) delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index b98f2bcc919..bca1ce7aad4 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -310,8 +310,6 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/hinge_loss_grad_kernel.cu
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/hinge_loss_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gru_grad_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/grid_sample_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/generate_proposals_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gaussian_inplace_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gammaln_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu deleted file mode 100644 index 83c47dc86db..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/grid_sample_grad_kernel.h" - -PD_CUSTOM_KERNEL_REGISTER(grid_sample_grad, - metax_gpu, - ALL_LAYOUT, - phi::GridSampleGradKernel, - float, - double) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu deleted file mode 100644 index a0447405971..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/grid_sample_kernel.h" - -PD_CUSTOM_KERNEL_REGISTER( - grid_sample, metax_gpu, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu new file mode 100644 index 00000000000..8aae95bdb22 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu @@ -0,0 +1,839 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kernels/metax_kernel/metax_context.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +namespace phi { + +template +static __forceinline__ __device__ void AtomicAdd(T* data, + IndexT h, + IndexT w, + IndexT sH, + IndexT sW, + IndexT H, + IndexT W, + T delta) { + if (InBounds(h, w, H, W)) { + phi::CudaAtomicAdd(data + h * sH + w * sW, delta); + } +} + +template +static __forceinline__ __device__ void AtomicAdd3D(T* data, + IndexT d, + IndexT h, + IndexT w, + IndexT sD, + IndexT sH, + IndexT sW, + IndexT D, + IndexT H, + IndexT W, + T delta) { + if (InBounds3D(d, h, w, D, H, W)) { + phi::CudaAtomicAdd(data + d * sD + h * sH + w * sW, delta); + } +} + +template +static __forceinline__ __device__ T +UnnormalizeWithMask(T coord, IndexT size, bool align_corners, T* grad_in) { + if (align_corners) { + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1.f) / 2) * (size - 1); + } else { + *grad_in = static_cast(size) / 2; + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T ClipIndexesWithMask(T in, + IndexT clip_limit, + T* grad_in) { + if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + T max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +template +static __forceinline__ __device__ T +ReflectIndexesWithMask(T in, IndexT twice_low, IndexT twice_high, T* grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + IndexT grad_in_mult_; + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + T extra = fmod(in, span); + IndexT flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T +ComputePositionsWithMask(T coord, + IndexT size, + PaddingMode padding_mode, + bool align_corners, + T* grad_in) { + T grad_clip, grad_refl; + coord = UnnormalizeWithMask(coord, size, align_corners, grad_in); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == PaddingMode::reflect) { + coord = align_corners ? 
ReflectIndexesWithMask( + coord, 0, 2 * (size - 1), &grad_refl) + : ReflectIndexesWithMask( + coord, -1, 2 * size - 1, &grad_refl); + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + return SafeDownGradeToIntRange(coord); +} + +template +__global__ void GridSamplerCudaBackwardKernel(const IndexT nthreads, + const T* grad_output, + const T* input, + const T* grid, + IndexT n, + IndexT out_c, + IndexT out_h, + IndexT out_w, + IndexT in_h, + IndexT in_w, + T* grad_input, + T* grad_grid, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT inp_sN = out_c * in_h * in_w; + IndexT inp_sC = in_h * in_w; + IndexT inp_sH = in_w; + IndexT inp_sW = 1; + IndexT grid_sN = out_h * out_w * 2; + IndexT grid_sH = out_w * 2; + IndexT grid_sW = 2; + IndexT grid_sCoor = 1; + + IndexT gOut_sN = out_c * out_h * out_w; + IndexT gOut_sC = out_h * out_w; + IndexT gOut_sH = out_w; + IndexT gOut_sW = 1; + + CUDA_KERNEL_LOOP(index, nthreads) { + const IndexT w = index % out_w; + const IndexT h = (index / out_w) % out_h; + const IndexT n = index / (out_h * out_w); + const IndexT grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + T gix_mult, giy_mult; + ix = ComputePositionsWithMask( + ix, in_w, padding_mode, align_corners, &gix_mult); + iy = ComputePositionsWithMask( + iy, in_h, padding_mode, align_corners, &giy_mult); + + if (mode == Mode::bilinear) { + IndexT ix_nw = static_cast(floor(ix)); + IndexT iy_nw = static_cast(floor(iy)); + IndexT ix_ne = ix_nw + 1; + IndexT iy_ne = iy_nw; + IndexT ix_sw = ix_nw; + IndexT iy_sw = iy_nw + 1; + IndexT ix_se = ix_nw + 1; + IndexT iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + T gix = static_cast(0), giy = static_cast(0); + IndexT gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + IndexT inp_offset_NC = n * inp_sN; + for (IndexT c = 0; c < out_c; ++c, + inp_offset_NC += inp_sC, + gInp_ptr_NC += inp_sC, + gOut_offset += gOut_sC) { + T gOut = grad_output[gOut_offset]; + + AtomicAdd( + gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, nw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, ne * gOut); + AtomicAdd( + gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, sw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, se * gOut); + + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; + gix -= nw_val * (iy_se - iy) * gOut; + giy -= nw_val * (ix_se - ix) * gOut; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; + gix += ne_val * (iy_sw - iy) * gOut; + giy -= ne_val * (ix - ix_sw) * gOut; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; + gix -= sw_val * (iy - iy_ne) * gOut; + giy += sw_val * (ix_ne - ix) * gOut; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; + gix += se_val * (iy - iy_nw) * gOut; + giy += se_val * (ix - ix_nw) * gOut; + } + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = gix_mult * gix; + gGrid_ptr_NHW[1] = giy_mult * giy; + } + } else if 
(mode == Mode::nearest) { + IndexT ix_nearest = static_cast(std::nearbyint(ix)); + IndexT iy_nearest = static_cast(std::nearbyint(iy)); + + IndexT gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (IndexT c = 0; c < out_c; + ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { + AtomicAdd(gInp_ptr_NC, + iy_nearest, + ix_nearest, + inp_sH, + inp_sW, + in_h, + in_w, + grad_output[gOut_offset]); + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = static_cast(0); + gGrid_ptr_NHW[1] = static_cast(0); + } + } + } +} + +template +__global__ void GridSampler3DCudaBackwardKernel(const IndexT nthreads, + const T* grad_output, + const T* input, + const T* grid, + IndexT out_c, + IndexT out_d, + IndexT out_h, + IndexT out_w, + IndexT in_d, + IndexT in_h, + IndexT in_w, + T* grad_input, + T* grad_grid, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT inp_sW = 1; + IndexT inp_sH = in_w; + IndexT inp_sD = in_h * in_w; + IndexT inp_sC = in_d * inp_sD; + IndexT inp_sN = out_c * inp_sC; + + IndexT grid_sCoor = 1; + IndexT grid_sW = 3; + IndexT grid_sH = out_w * grid_sW; + IndexT grid_sD = out_h * grid_sH; + IndexT grid_sN = out_d * grid_sD; + + IndexT gOut_sW = 1; + IndexT gOut_sH = out_w; + IndexT gOut_sD = out_h * out_w; + IndexT gOut_sC = out_d * gOut_sD; + IndexT gOut_sN = out_c * gOut_sC; + + CUDA_KERNEL_LOOP_TYPE(index, nthreads, IndexT) { + const IndexT w = index % out_w; + const IndexT h = (index / out_w) % out_h; + const IndexT d = (index / (out_h * out_w)) % out_d; + const IndexT n = index / (out_d * out_h * out_w); + const auto grid_offset = + n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y, z coordinates from grid + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + T iz = grid[grid_offset + 2 * grid_sCoor]; + + // multipliers for gradients on ix, iy, and iz + T gix_mult, giy_mult, giz_mult; + ix = ComputePositionsWithMask( + ix, in_w, padding_mode, align_corners, &gix_mult); + iy = ComputePositionsWithMask( + iy, in_h, padding_mode, align_corners, &giy_mult); + iz = ComputePositionsWithMask( + iz, in_d, padding_mode, align_corners, &giz_mult); + + if (mode == Mode::bilinear) { + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + IndexT ix_tnw = static_cast(std::floor(ix)); + IndexT iy_tnw = static_cast(std::floor(iy)); + IndexT iz_tnw = static_cast(std::floor(iz)); + + IndexT ix_tne = ix_tnw + 1; + IndexT iy_tne = iy_tnw; + IndexT iz_tne = iz_tnw; + + IndexT ix_tsw = ix_tnw; + IndexT iy_tsw = iy_tnw + 1; + IndexT iz_tsw = iz_tnw; + + IndexT ix_tse = ix_tnw + 1; + IndexT iy_tse = iy_tnw + 1; + IndexT iz_tse = iz_tnw; + + IndexT ix_bnw = ix_tnw; + IndexT iy_bnw = iy_tnw; + IndexT iz_bnw = iz_tnw + 1; + + IndexT ix_bne = ix_tnw + 1; + IndexT iy_bne = iy_tnw; + IndexT iz_bne = iz_tnw + 1; + + IndexT ix_bsw = ix_tnw; + IndexT iy_bsw = iy_tnw + 1; + IndexT iz_bsw = iz_tnw + 1; + + IndexT ix_bse = ix_tnw + 1; + IndexT iy_bse = iy_tnw + 1; + IndexT iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + T bne = (ix - ix_tsw) * (iy_tsw - iy) * 
(iz - iz_tsw); + T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + T gix = static_cast(0), giy = static_cast(0), + giz = static_cast(0); + IndexT gOut_offset = + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + IndexT inp_offset_NC = n * inp_sN; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (IndexT c = 0; c < out_c; ++c, + gOut_offset += gOut_sC, + gInp_ptr_NC += inp_sC, + inp_offset_NC += inp_sC) { + T gOut = grad_output[gOut_offset]; + + AtomicAdd3D(gInp_ptr_NC, + iz_tnw, + iy_tnw, + ix_tnw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tnw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_tne, + iy_tne, + ix_tne, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tne * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_tsw, + iy_tsw, + ix_tsw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tsw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_tse, + iy_tse, + ix_tse, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tse * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bnw, + iy_bnw, + ix_bnw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bnw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bne, + iy_bne, + ix_bne, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bne * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bsw, + iy_bsw, + ix_bsw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bsw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bse, + iy_bse, + ix_bse, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bse * gOut); + + // calculate grad_grid + if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) { + T tnw_val = input[inp_offset_NC + iz_tnw * inp_sD + iy_tnw * inp_sH + + ix_tnw * inp_sW]; + gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; + giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; + giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; + } + if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) { + T tne_val = input[inp_offset_NC + iz_tne * inp_sD + iy_tne * inp_sH + + ix_tne * inp_sW]; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; + giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; + giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; + } + if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) { + T tsw_val = input[inp_offset_NC + iz_tsw * inp_sD + iy_tsw * inp_sH + + ix_tsw * inp_sW]; + gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; + giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; + } + if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) { + T tse_val = input[inp_offset_NC + iz_tse * inp_sD + iy_tse * inp_sH + + ix_tse * inp_sW]; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; + giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; + } + if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) { + T bnw_val = input[inp_offset_NC + iz_bnw * inp_sD + iy_bnw * inp_sH + + ix_bnw * inp_sW]; + gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut; + giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; + } + if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) { + T bne_val = input[inp_offset_NC + iz_bne * inp_sD + iy_bne * inp_sH + + ix_bne * inp_sW]; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; + giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; + } + if (InBounds3D(iz_bsw, 
iy_bsw, ix_bsw, in_d, in_h, in_w)) { + T bsw_val = input[inp_offset_NC + iz_bsw * inp_sD + iy_bsw * inp_sH + + ix_bsw * inp_sW]; + gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; + } + if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) { + T bse_val = input[inp_offset_NC + iz_bse * inp_sD + iy_bse * inp_sH + + ix_bse * inp_sW]; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; + } + } + if (grad_grid != nullptr) { + T* gGrid_ptr_NDHW = grad_grid + index * grid_sW; + gGrid_ptr_NDHW[0] = gix_mult * gix; + gGrid_ptr_NDHW[1] = giy_mult * giy; + gGrid_ptr_NDHW[2] = giz_mult * giz; + } + } else if (mode == Mode::nearest) { + IndexT ix_nearest = static_cast(std::round(ix)); + IndexT iy_nearest = static_cast(std::round(iy)); + IndexT iz_nearest = static_cast(std::round(iz)); + + // assign nearest neighbor pixel value to output pixel + IndexT gOut_offset = + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (IndexT c = 0; c < out_c; + ++c, gOut_offset += gOut_sC, gInp_ptr_NC += inp_sC) { + AtomicAdd3D(gInp_ptr_NC, + iz_nearest, + iy_nearest, + ix_nearest, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + grad_output[gOut_offset]); + } + if (grad_grid != nullptr) { + T* gGrid_ptr_NDHW = grad_grid + index * grid_sW; + gGrid_ptr_NDHW[0] = static_cast(0); + gGrid_ptr_NDHW[1] = static_cast(0); + gGrid_ptr_NDHW[2] = static_cast(0); + } + } + } +} + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + if (out_grad.numel() == 0) { + if (x_grad) { + phi::Full( + dev_ctx, phi::IntArray(common::vectorize(x_grad->dims())), 0, x_grad); + } + if (grid_grad) { + phi::Full(dev_ctx, + phi::IntArray(common::vectorize(grid_grad->dims())), + 0, + grid_grad); + } + return; + } + + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + +#ifndef PADDLE_WITH_HIP + if (condCudnnGridSampler(x, grid) && + enum_padding_mode == PaddingMode::zeros && enum_mode == Mode::bilinear && + align_corners) { + const int64_t N = x.dims()[0]; + const int64_t C = x.dims()[1]; + const int64_t H_in = x.dims()[2]; + const int64_t W_in = x.dims()[3]; + const int64_t H_out = grid.dims()[1]; + const int64_t W_out = grid.dims()[2]; + + // cuDNN handle + cudnnHandle_t handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + + // Create and set Tensor descriptors (NCHW) for x/y + cudnnTensorDescriptor_t x_desc, dx_desc, y_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&dx_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&y_desc)); + + const cudnnDataType_t cudnn_dtype = + std::is_same::value ? 
CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(x_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_in), + static_cast(W_in))); + + // The shape of dx is consistent with that of x + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(dx_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_in), + static_cast(W_in))); + + // The shape of y is consistent with out_grad + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(y_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out))); + + // Spatial Transformer descriptor: specifies sampler type and output + // dimension (N, C, H_out, W_out) + cudnnSpatialTransformerDescriptor_t st_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateSpatialTransformerDescriptor(&st_desc)); + int st_dims[4] = {static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out)}; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetSpatialTransformerNdDescriptor( + st_desc, CUDNN_SAMPLER_BILINEAR, cudnn_dtype, 4, st_dims)); + + // data pointer + const T* x_data = x.data(); + const T* grid_data = grid.data(); + const T* dy_data = out_grad.data(); + + T* dx_data = dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* dgrid_data = nullptr; + if (grid_grad) { + dgrid_data = dev_ctx.template Alloc(grid_grad); + } + + // alpha/beta + using AlphaBetaT = typename std:: + conditional::value, float, double>::type; + const AlphaBetaT one = static_cast(1.0); + const AlphaBetaT zero = static_cast(0.0); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSpatialTfSamplerBackward( + handle, + st_desc, + static_cast(&one), // alpha (for dx) + x_desc, + static_cast(x_data), + static_cast(&zero), // beta (for dx) + dx_desc, + static_cast(dx_data), + static_cast(&one), // alpha (for dgrid) + y_desc, + static_cast(dy_data), + static_cast(grid_data), + static_cast(&zero), // beta (for dgrid) + static_cast(dgrid_data))); + + // resource release + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroySpatialTransformerDescriptor(st_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(dx_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(y_desc)); + return; + } +#endif + + bool use_int32_index = x.numel() <= std::numeric_limits::max() && + grid.numel() <= std::numeric_limits::max() && + out_grad.numel() <= std::numeric_limits::max(); + + if (x.dims().size() == 4) { + const int64_t n = grid.dims()[0]; + const int64_t out_h = grid.dims()[1]; + const int64_t out_w = grid.dims()[2]; + const int64_t c = x.dims()[1]; + const int64_t in_h = x.dims()[2]; + const int64_t in_w = x.dims()[3]; + + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* grid_grad_data = nullptr; + if (grid_grad != nullptr) { + grid_grad_data = dev_ctx.template Alloc(grid_grad); + } + + int64_t count = n * out_h * out_w; + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSamplerCudaBackwardKernel \ + <<>>( \ + count, \ + out_grad.data(), \ + x.data(), \ + grid.data(), \ + n, \ + c, \ + 
out_h, \ + out_w, \ + in_h, \ + in_w, \ + x_grad->data(), \ + grid_grad_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners); + if (use_int32_index) { + LAUNCH_KERNEL(int32_t) + } else { + LAUNCH_KERNEL(int64_t) + } +#undef LAUNCH_KERNEL + } else { + const int64_t out_d = grid.dims()[1]; + const int64_t out_h = grid.dims()[2]; + const int64_t out_w = grid.dims()[3]; + const int64_t n = x.dims()[0]; + const int64_t c = x.dims()[1]; + const int64_t in_d = x.dims()[2]; + const int64_t in_h = x.dims()[3]; + const int64_t in_w = x.dims()[4]; + + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* grid_grad_data = nullptr; + if (grid_grad != nullptr) { + grid_grad_data = dev_ctx.template Alloc(grid_grad); + } + + int64_t count = static_cast(n * out_d * out_h * out_w); + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSampler3DCudaBackwardKernel \ + <<>>( \ + count, \ + out_grad.data(), \ + x.data(), \ + grid.data(), \ + c, \ + out_d, \ + out_h, \ + out_w, \ + in_d, \ + in_h, \ + in_w, \ + x_grad->data(), \ + grid_grad_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners); + if (use_int32_index) { + LAUNCH_KERNEL(int32_t) + } else { + LAUNCH_KERNEL(int64_t) + } +#undef LAUNCH_KERNEL + } +} + +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL(grid_sample_grad, + metax_gpu, + ALL_LAYOUT, + phi::GridSampleGradKernel, + float, + double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu new file mode 100644 index 00000000000..71050c264c6 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu @@ -0,0 +1,527 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "glog/logging.h" +#include "kernels/metax_kernel/metax_context.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" +#include "paddle/phi/kernels/grid_sample_kernel.h" + +namespace phi { + +template +static __forceinline__ __device__ T Unnormalize(T coord, + IndexT size, + bool align_corners) { + return align_corners ?
((coord + 1.f) / 2) * (size - 1) + : ((coord + 1.f) * size - 1) / 2; +} + +template +static __forceinline__ __device__ T ClipIndexes(T in, IndexT max_value) { + return min(static_cast(max_value - 1), max(in, static_cast(0))); +} + +template +static __forceinline__ __device__ T ReflectIndexes(T in, + IndexT twice_low, + IndexT twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = fabs(in - min); + T extra = fmod(in, span); + IndexT flips = floor(in / span); + return (flips & 1) ? span - extra + min : extra + min; // cond ? odd : even +} + +template +static __forceinline__ __device__ T ComputePositions(T coord, + IndexT size, + PaddingMode padding_mode, + bool align_corners) { + coord = Unnormalize(coord, size, align_corners); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexes(coord, size); + } else if (padding_mode == PaddingMode::reflect) { + coord = align_corners ? ReflectIndexes(coord, 0, 2 * (size - 1)) + : ReflectIndexes(coord, -1, 2 * size - 1); + coord = ClipIndexes(coord, size); + } + return SafeDownGradeToIntRange(coord); +} + +template +__global__ void GridSampleCudaKernel(IndexT n, + IndexT out_c, + IndexT out_hw, + IndexT in_h, + IndexT in_w, + const T* __restrict__ input, + const T* __restrict__ grid, + T* __restrict__ output, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT nthreads = n * out_hw; + IndexT inp_sN = out_c * (in_h * in_w); + IndexT inp_sC = in_h * in_w; + IndexT inp_sH = in_w; + IndexT inp_sW = 1; + IndexT grid_sNHW = 2; + IndexT grid_sCoor = 1; + IndexT out_sN = out_c * out_hw; + IndexT out_sC = out_hw; + IndexT out_sHW = 1; + CUDA_KERNEL_LOOP_TYPE(index, nthreads, IndexT) { + const IndexT hw = index % out_hw; + const IndexT n = index / out_hw; + const IndexT grid_offset = index * grid_sNHW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + ix = ComputePositions(ix, in_w, padding_mode, align_corners); + iy = ComputePositions(iy, in_h, padding_mode, align_corners); + if (mode == Mode::bilinear) { + IndexT ix_nw = floor(ix); + IndexT iy_nw = floor(iy); + IndexT ix_ne = ix_nw + 1; + IndexT iy_ne = iy_nw; + IndexT ix_sw = ix_nw; + IndexT iy_sw = iy_nw + 1; + IndexT ix_se = ix_nw + 1; + IndexT iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + IndexT inp_offset_NC = n * inp_sN; + T* out_ptr_NCHW = output + (n * out_sN + hw * out_sHW); + + for (IndexT c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + T value{0}; + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + value += input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + value += input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + value += input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + value += input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; + } + *out_ptr_NCHW = value; + } + } else if (mode == Mode::nearest) { + IndexT ix_nearest = std::nearbyint(ix); + IndexT iy_nearest = std::nearbyint(iy); + IndexT inp_offset_NC = n * inp_sN; + T* out_ptr_NCHW = output + (n * out_sN + hw * out_sHW); + for (IndexT c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + if 
(InBounds(iy_nearest, ix_nearest, in_h, in_w)) { + *out_ptr_NCHW = + input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; + } else { + *out_ptr_NCHW = static_cast(0); + } + } + } + } +} + +template +__global__ void GridSample3DCudaKernel(const IndexT nthreads, + IndexT out_c, + IndexT out_d, + IndexT out_h, + IndexT out_w, + IndexT in_d, + IndexT in_h, + IndexT in_w, + const T* input, + const T* grid, + T* output, + const Mode interpolation_mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT inp_sW = 1; + IndexT inp_sH = in_w; + IndexT inp_sD = in_h * in_w; + IndexT inp_sC = in_d * inp_sD; + IndexT inp_sN = out_c * inp_sC; + + IndexT grid_sCoor = 1; + IndexT grid_sW = 3; + IndexT grid_sH = out_w * grid_sW; + IndexT grid_sD = out_h * grid_sH; + IndexT grid_sN = out_d * grid_sD; + + IndexT out_sW = 1; + IndexT out_sH = out_w; + IndexT out_sD = out_h * out_w; + IndexT out_sC = out_d * out_sD; + IndexT out_sN = out_c * out_sC; + + CUDA_KERNEL_LOOP_TYPE(index, nthreads, IndexT) { + const IndexT w = index % out_w; + const IndexT h = (index / out_w) % out_h; + const IndexT d = (index / (out_h * out_w)) % out_d; + const IndexT n = index / (out_d * out_h * out_w); + const IndexT grid_offset = + n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + // get the corresponding input x, y, z coordinates from grid + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + T iz = grid[grid_offset + 2 * grid_sCoor]; + ix = ComputePositions(ix, in_w, padding_mode, align_corners); + iy = ComputePositions(iy, in_h, padding_mode, align_corners); + iz = ComputePositions(iz, in_d, padding_mode, align_corners); + if (interpolation_mode == Mode::bilinear) { + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + IndexT ix_tnw = static_cast(std::floor(ix)); + IndexT iy_tnw = static_cast(std::floor(iy)); + IndexT iz_tnw = static_cast(std::floor(iz)); + + IndexT ix_tne = ix_tnw + 1; + IndexT iy_tne = iy_tnw; + IndexT iz_tne = iz_tnw; + + IndexT ix_tsw = ix_tnw; + IndexT iy_tsw = iy_tnw + 1; + IndexT iz_tsw = iz_tnw; + + IndexT ix_tse = ix_tnw + 1; + IndexT iy_tse = iy_tnw + 1; + IndexT iz_tse = iz_tnw; + + IndexT ix_bnw = ix_tnw; + IndexT iy_bnw = iy_tnw; + IndexT iz_bnw = iz_tnw + 1; + + IndexT ix_bne = ix_tnw + 1; + IndexT iy_bne = iy_tnw; + IndexT iz_bne = iz_tnw + 1; + + IndexT ix_bsw = ix_tnw; + IndexT iy_bsw = iy_tnw + 1; + IndexT iz_bsw = iz_tnw + 1; + + IndexT ix_bse = ix_tnw + 1; + IndexT iy_bse = iy_tnw + 1; + IndexT iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + T bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + const T* inp_ptr_NC = input + n * inp_sN; + T* out_ptr_NCDHW = + output + (n * out_sN + d * out_sD + h * out_sH + w * out_sW); + for (IndexT c = 0; c < out_c; + ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { + *out_ptr_NCDHW = static_cast(0); + if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * + tnw; + } + if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) { + 
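+            // guarded accumulation: the tne corner adds its input value scaled by the trilinear weight tne computed above; each of the eight corners below follows the same pattern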
*out_ptr_NCDHW += + inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * + tne; + } + if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * + tsw; + } + if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * + tse; + } + if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * + bnw; + } + if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * + bne; + } + if (InBounds3D(iz_bsw, iy_bsw, ix_bsw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * + bsw; + } + if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * + bse; + } + } + } else if (interpolation_mode == Mode::nearest) { + IndexT ix_nearest = static_cast(std::nearbyint(ix)); + IndexT iy_nearest = static_cast(std::nearbyint(iy)); + IndexT iz_nearest = static_cast(std::nearbyint(iz)); + + // assign nearest neighbor pixel value to output pixel + const T* inp_ptr_NC = input + n * inp_sN; + T* out_ptr_NCDHW = + output + (n * out_sN + d * out_sD + h * out_sH + w * out_sW); + for (IndexT c = 0; c < out_c; + ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { + if (InBounds3D(iz_nearest, iy_nearest, ix_nearest, in_d, in_h, in_w)) { + *out_ptr_NCDHW = + inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + + ix_nearest * inp_sW]; + } else { + *out_ptr_NCDHW = static_cast(0); + } + } + } + } +} + +template +void GridSampleKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* out) { + if (out && out->numel() == 0) { + dev_ctx.template Alloc(out); + return; + } + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + +#ifndef PADDLE_WITH_HIP + if (condCudnnGridSampler(x, grid) && + enum_padding_mode == PaddingMode::zeros && enum_mode == Mode::bilinear && + align_corners) { + const int64_t N = x.dims()[0]; + const int64_t C = x.dims()[1]; + const int64_t H_in = x.dims()[2]; + const int64_t W_in = x.dims()[3]; + const int64_t H_out = grid.dims()[1]; + const int64_t W_out = grid.dims()[2]; + + out->Resize({N, C, H_out, W_out}); + auto* out_data = dev_ctx.template Alloc(out); + + cudnnHandle_t handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + + // Create and set Tensor descriptors (NCHW) for x and out + cudnnTensorDescriptor_t x_desc, y_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&y_desc)); + + const cudnnDataType_t cudnn_dtype = + std::is_same::value ? 
CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(x_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_in), + static_cast(W_in))); + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(y_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out))); + + // Spatial Transformer descriptor: specifies sampler type and output + // dimension (N, C, H_out, W_out) + cudnnSpatialTransformerDescriptor_t st_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateSpatialTransformerDescriptor(&st_desc)); + int st_dims[4] = {static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out)}; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetSpatialTransformerNdDescriptor( + st_desc, CUDNN_SAMPLER_BILINEAR, cudnn_dtype, 4, st_dims)); + + const T* x_data = x.data(); + const T* grid_data = grid.data(); + using AlphaBetaT = typename std:: + conditional::value, float, double>::type; + const AlphaBetaT alpha = static_cast(1.0); + const AlphaBetaT beta = static_cast(0.0); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSpatialTfSamplerForward( + handle, + st_desc, + static_cast(&alpha), + x_desc, + static_cast(x_data), + static_cast(grid_data), + static_cast(&beta), + y_desc, + static_cast(out_data))); + + // resource release + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroySpatialTransformerDescriptor(st_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(y_desc)); + return; + } +#endif + + bool use_int32_index = x.numel() <= std::numeric_limits::max() && + grid.numel() <= std::numeric_limits::max() && + out->numel() <= std::numeric_limits::max(); + + if (x.dims().size() == 4) { + const int64_t n = grid.dims()[0]; + const int64_t out_h = grid.dims()[1]; + const int64_t out_w = grid.dims()[2]; + const int64_t c = x.dims()[1]; + const int64_t in_h = x.dims()[2]; + const int64_t in_w = x.dims()[3]; + VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h + << "; out_w: " << out_w; + + auto* output_data = dev_ctx.template Alloc(out); + VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; " + << out->dims()[2] << "; " << out->dims()[3]; + + int64_t count = n * out_h * out_w; + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSampleCudaKernel \ + <<>>( \ + n, \ + c, \ + out_h * out_w, \ + in_h, \ + in_w, \ + x.data(), \ + grid.data(), \ + output_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners) + if (use_int32_index) { + LAUNCH_KERNEL(int); + } else { + LAUNCH_KERNEL(int64_t); + } +#undef LAUNCH_KERNEL + } else { + const int64_t n = grid.dims()[0]; + const int64_t out_d = grid.dims()[1]; + const int64_t out_h = grid.dims()[2]; + const int64_t out_w = grid.dims()[3]; + const int64_t c = x.dims()[1]; + const int64_t in_d = x.dims()[2]; + const int64_t in_h = x.dims()[3]; + const int64_t in_w = x.dims()[4]; + + VLOG(3) << "n: " << n << "; c: " << c << "; out_d: " << out_d + << "; out_h: " << out_h << "; out_w: " << out_w; + + auto* output_data = dev_ctx.template Alloc(out); + VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; " + << out->dims()[2] << "; " << out->dims()[3] << "; " + << 
out->dims()[4]; + + int64_t count = n * out_d * out_h * out_w; + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSample3DCudaKernel \ + <<>>( \ + count, \ + c, \ + out_d, \ + out_h, \ + out_w, \ + in_d, \ + in_h, \ + in_w, \ + x.data(), \ + grid.data(), \ + output_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners) + if (use_int32_index) { + LAUNCH_KERNEL(int); + } else { + LAUNCH_KERNEL(int64_t); + } +#undef LAUNCH_KERNEL + } +} + +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL( + grid_sample, metax_gpu, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu index eae8c8c0301..d2f39ccf751 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu @@ -35,6 +35,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, const int32_t group_size, DenseTensor* out) { dev_ctx.template Alloc(out); + auto stream = dev_ctx.stream(); const T* x_data = x.data(); const int8_t* weight_data = weight.data(); const T* bias_data = bias ? bias.get().data() : nullptr; @@ -128,7 +129,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, k, n, n}; - mctlass_op(arguments); + mctlass_op(arguments, NULL, stream); } else { mctlassGemmScaleOp_w8a16_bias mctlass_op; typename mctlassGemmScaleOp_w8a16_bias::Arguments arguments{ From 89912995a39f939a582aeb953f761a588c89663d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 23 Sep 2025 20:02:41 +0800 Subject: [PATCH 45/58] =?UTF-8?q?change=E2=80=94ut=20(#59)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * change_ut --- backends/metax_gpu/tests/default.txt | 9 --------- 1 file changed, 9 deletions(-) diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 9f073d7e92f..9c989161fed 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -42,7 +42,6 @@ test_shape_op test_tril_triu_op test_slice_op test_elementwise_add_op -test_index_put_op test_bincount_op test_assign_op test_logical_op @@ -73,7 +72,6 @@ test_fractional_max_pool3d_api test_nll_loss test_is_empty_op test_norm_nn_grad -test_index_fill test_floor test_slice_scatter test_nn_matmul_v2_grad @@ -127,10 +125,8 @@ test_flip test_fused_bias_dropout_residual_layer_norm_op test_greater_equal_op test_add_op -test_cartesian_prod test_uniform_random_inplace_op test_feed_fetch_method -test_pow_op test_conv3d_transpose_op test_add_position_encoding_op test_imperative_data_loader_base @@ -223,12 +219,9 @@ test_executor_check_fetch_list test_inplace_softmax_with_cross_entropy test_cos test_imperative_parallel_coalesce_split -test_grid_sample_function -test_rnn_decode_api test_triu_indices_op test_binary_cross_entropy_with_logits_op test_mean_op_v1 -test_round_op test_assign_pos_op_dygraph test_nn_functional_embedding_static test_norm_op @@ -262,7 +255,6 @@ test_diag_v2 test_complex_transpose test_prior_box_op test_square_error_cost -test_fused_rotary_position_embedding test_gru_rnn_op test_restrict_nonzero test_dygraph_weight_norm @@ -295,7 +287,6 @@ test_argsort_op test_layer_norm_op_v2 test_adaptive_max_pool1d test_shard_index_op -test_cuda_max_memory_allocated 
test_roi_align_op test_sin test_take From a770e6f197e8c519712a4a7d2359110d34dc0431 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 23 Sep 2025 20:50:24 +0800 Subject: [PATCH 46/58] change_ut (#60) * change_ut --------- --- backends/metax_gpu/tests/default.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 9c989161fed..21adad68f5b 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -28,7 +28,6 @@ test_one_hot_v2_op test_fill_any_op test_gather_op test_reshape_op -test_index_put_op test_bitwise_op test_max_op test_pad_op @@ -214,7 +213,6 @@ test_tile_op test_adam_optimizer_fp32_fp64 test_batch_norm_op test_gather_nd_op -test_pow test_executor_check_fetch_list test_inplace_softmax_with_cross_entropy test_cos From 902112bb8707edebefa747e4994384df27c3f356 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 24 Sep 2025 10:05:05 +0800 Subject: [PATCH 47/58] change_ut (#63) * change_ut * change_ut --------- --- backends/metax_gpu/tests/default.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 21adad68f5b..54f0b7c008f 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -177,7 +177,6 @@ test_imperative_data_parallel test_sigmoid test_adaptive_max_pool3d test_roll_op -test_index_put_op test_assign_op test_amp_check_finite_and_scale_op test_strided_slice_op From cfe44ce24e2e67c595057e0568b7c34f55c08b0a Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:04:11 +0800 Subject: [PATCH 48/58] [Metax] add keyword filter in CI CMakeLists.txt (#64) * [Metax] add keyword filter in CI CMakeLists.txt * [Metax] add ignore case list --- backends/metax_gpu/tests/CMakeLists.txt | 62 ++++++++++++------------- backends/metax_gpu/tests/ignore.txt | 21 +++++++++ 2 files changed, 50 insertions(+), 33 deletions(-) create mode 100644 backends/metax_gpu/tests/ignore.txt diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index e8b11d347d9..0c84ada4b65 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -9,6 +9,8 @@ set(PADDLE_LEGACY_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test) set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) +set(NEED_REMOVE_KEYWORDS "attention") + file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") if(NOT TEST_LIST_FILE) @@ -33,6 +35,20 @@ else() endif() foreach(test_name ${TEST_PROGRAMS}) + set(IS_REMOVE FALSE) + + foreach(keyword ${NEED_REMOVE_KEYWORDS}) + string(FIND "${test_name}" "${keyword}" RES) + if(NOT RES EQUAL -1) + set(IS_REMOVE TRUE) + break() + endif() + endforeach() + + if(IS_REMOVE) + continue() + endif() + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) message(WARNING "${CURRENT_TEST_PROGRAM} is not exist, skip it.") @@ -44,39 +60,19 @@ endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) if(NOT TEST_LIST_FILE) - list( - REMOVE_ITEM - PYTHON_TEST_SCRIPTS - # Metax unit test - ${METAX_UNIT_TEST_PATH}/test_matmul_op_metax.py - # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - # 
core.cudnnversion - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # op_test.py 里 self._get_places()接口的适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - # needs check_grad with fp64 precision - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) + set(NEED_IGNORE_FILE ${CMAKE_CURRENT_LIST_DIR}/ignore.txt) + if(EXISTS ${NEED_IGNORE_FILE}) + file(STRINGS ${NEED_IGNORE_FILE} NEED_IGNORE_TEST_PROGRAMS) + foreach(test_name ${NEED_IGNORE_TEST_PROGRAMS}) + if(EXISTS ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + list(REMOVE_ITEM PYTHON_TEST_SCRIPTS + ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + else() + list(REMOVE_ITEM PYTHON_TEST_SCRIPTS + ${METAX_UNIT_TEST_PATH}/${test_name}.py) + endif() + endforeach() + endif() endif() if(LOG_OUTPUT_DIR AND NOT EXISTS ${LOG_OUTPUT_DIR}) diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt new file mode 100644 index 00000000000..b4f1afbe5b0 --- /dev/null +++ b/backends/metax_gpu/tests/ignore.txt @@ -0,0 +1,21 @@ +test_matmul_op_metax +test_sum_op +test_max_op +test_cumsum_op +test_softmax_with_cross_entropy_op +test_softmax_op +test_elementwise_add_op +test_gather_op +test_elementwise_pow_op +test_layer_norm_op +test_index_add_op +test_elementwise_div_op +test_stack_op +test_logical_op +test_mean_op +test_transpose_op +test_randint_op +test_uniform_random_op +test_c_embedding_op +test_slice_op +test_compare_op From 78946fd334dacbdb3f8ba9b07d9273a8462e8512 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Fri, 26 Sep 2025 15:48:08 +0800 Subject: [PATCH 49/58] [metax] modify kernels (#67) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas * modify context * modify kernels --- .../fused_conv2d_add_act_kernel_register.cu | 0 .../fused_rope_grad_kernel_register.cu | 0 .../fused_rope_kernel_register.cu | 0 .../kernels/metax_kernel/metax_context.cc | 26 ------------------- .../kernels/metax_kernel/metax_context.h | 3 +-- 5 files changed, 1 insertion(+), 28 deletions(-) rename backends/metax_gpu/kernels/{metax_kernel => fusion}/fused_conv2d_add_act_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{metax_kernel => fusion}/fused_rope_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{metax_kernel => fusion}/fused_rope_kernel_register.cu (100%) diff --git a/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_conv2d_add_act_kernel_register.cu similarity index 100% rename from 
backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_conv2d_add_act_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index efddba5f00b..0712fb75bbe 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -15,24 +15,6 @@ #include "kernels/metax_kernel/metax_context.h" namespace phi { -const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return true; -}(); - -const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; -}(); - -bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, @@ -42,19 +24,11 @@ void DnnWorkspaceHandle::RunFuncSync( void* workspace_ptr = nullptr; size_t size = ((required_workspace_bytes + 255) >> 8) << 8; std::lock_guard guard(*mtx_); -#ifdef PADDLE_WITH_HIP - auto status = hipMalloc(&workspace_ptr, size); -#else auto status = cudaMalloc(&workspace_ptr, size); -#endif if (status == gpuSuccess) { cudnn_func(workspace_ptr); phi::backends::gpu::GpuStreamSync(stream_); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); -#else PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); -#endif return; } } diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 2d761439089..7386811a236 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -18,6 +18,7 @@ #include #include "kernels/funcs/blas/cublasLt.h" +#include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" @@ -30,8 +31,6 @@ cublasLtHandle_t GetBlasLtHandle(); namespace phi { -bool AllowTF32Cublas(); -bool AllowTF32Cudnn(); class DnnWorkspaceHandle { public: inline DnnWorkspaceHandle(Allocator* allocator, gpuStream_t stream) From ac78af20874e28a7d5c3f1beed40762c716213bb Mon Sep 17 00:00:00 2001 From: Theendlessofhell <148317258+Theendlessofhell@users.noreply.github.com> Date: Fri, 26 Sep 2025 15:48:59 +0800 Subject: [PATCH 50/58] Fix part of the missing kernel issues (#66) Co-authored-by: root --- .../kernels/cuda_kernels/multinomial_kernel_register.cu | 3 ++- .../kernels/cuda_kernels/take_along_axis_kernel_register.cu | 5 ++++- 
.../metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu | 1 +
 .../kernels/metax_kernel/layer_norm_grad_kernel_register.cu | 1 +
 4 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu
index 622e70728f1..1325fa339b0 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu
@@ -21,6 +21,7 @@ PD_CUSTOM_KERNEL_REGISTER(multinomial,
                           phi::MultinomialKernel,
                           phi::dtype::float16,
                           phi::dtype::bfloat16,
-                          float) {
+                          float,
+                          double) {
   kernel->OutputAt(0).SetDataType(phi::DataType::INT64);
 }
diff --git a/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu
index 4b23b0820fc..b628552aaaf 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu
@@ -25,4 +25,7 @@ PD_CUSTOM_KERNEL_REGISTER(take_along_axis,
                           int64_t,
                           int,
                           phi::dtype::float16,
-                          phi::dtype::bfloat16) {}
+                          phi::dtype::bfloat16,
+                          uint8_t,  // support uint8
+                          int16_t   // support int16
+) {}
diff --git a/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu
index 287fa8de41a..ead21b1eb7e 100644
--- a/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu
@@ -22,5 +22,6 @@ PD_REGISTER_PLUGIN_KERNEL(addmm,
                           ALL_LAYOUT,
                           phi::AddmmKernel,
                           float,
+                          double,
                           phi::dtype::float16,
                           phi::dtype::bfloat16) {}
diff --git a/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu
index 87c06dab2a4..857dcb6d522 100644
--- a/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu
@@ -115,6 +115,7 @@ PD_REGISTER_PLUGIN_KERNEL(layer_norm_grad,
                           ALL_LAYOUT,
                           phi::LayerNormGradKernel,
                           float,
+                          double,
                           phi::dtype::float16,
                           phi::dtype::bfloat16) {
   if (kernel_key.dtype() == phi::DataType::FLOAT16) {

From 4ce9fe6de10402f04917cae8bd0f83bf499bdf1e Mon Sep 17 00:00:00 2001
From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com>
Date: Fri, 26 Sep 2025 18:18:36 +0800
Subject: [PATCH 51/58] [Metax] fix index_elementwise_get kernel (#68)

* [Metax] add keyword filter in CI CMakeLists.txt

* [Metax] add ignore case list

* [Metax] fix phi::backends::gpu::DnnVersion() symbol not found

* Revert "[Metax] fix phi::backends::gpu::DnnVersion() symbol not found"

This reverts commit 087a9c1240f024210d536e543a2fc55db1175529.
* [Metax] fix index_elementwise_get kernel --- backends/metax_gpu/CMakeLists.txt | 2 +- .../index_elementwise_get_kernel_register.cu | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index bca1ce7aad4..3b74ae39c18 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -326,7 +326,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/increment_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu index 5ab3d2a3170..a45a740fc61 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/index_elementwise_get_kernel.h" +#include "paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu" // NOLINT PD_CUSTOM_KERNEL_REGISTER(index_elementwise_get, metax_gpu, @@ -27,7 +27,7 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_get, int64_t, int16_t, uint8_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} From 3c8d0173075d49bef48a909a39f12d325e276f00 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 29 Sep 2025 10:42:05 +0800 Subject: [PATCH 52/58] [metax]fix patch and fix missing kernel (#72) * [metax]fix patch and fix missing kernel --- backends/metax_gpu/CMakeLists.txt | 3 + .../cuda_kernels/adam_kernel_selected_rows.cu | 41 ++++++++++++ .../cuda_kernels/einsum_kernel_register.cu | 16 ++--- .../lars_momentum_kernel_register.cu | 29 +++++++++ .../cuda_kernels/nonzero_kernel_register.cu | 8 ++- .../put_along_axis_kernel_register.cu | 6 +- backends/metax_gpu/patch/paddle.patch | 65 ------------------- 7 files changed, 90 insertions(+), 78 deletions(-) create mode 100644 backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 3b74ae39c18..5930eaaebd2 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -535,6 +535,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/clip_by_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/uniform_random_batch_size_like_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/get_tensor_from_selected_rows_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/batch_norm_kernel.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/empty_kernel.cc @@ -642,6 +643,8 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/lars_momentum_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_sum_kernel.cu # ############################################################################ ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps diff --git a/backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu b/backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu new file mode 100644 index 00000000000..df4105efbd2 --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu @@ -0,0 +1,41 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/selected_rows_functor.h" +#include "paddle/phi/kernels/selected_rows/adam_kernel.h" + +PD_CUSTOM_KERNEL_REGISTER(adam_dense_param_sparse_grad, + metax_gpu, + ALL_LAYOUT, + phi::sr::AdamDenseParamSparseGradKernel, + float, + double, + phi::float16) { + // Skip beta1_pow, beta2_pow, skip_update data transform + kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(7).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(9).SetBackend(phi::Backend::ALL_BACKEND); + + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(6).SetDataType(phi::DataType::FLOAT32); + } + kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED); + kernel->OutputAt(5).SetBackend(phi::Backend::UNDEFINED); +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu index 444928af78f..0f613b55e9e 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu @@ -23,10 +23,10 @@ PD_CUSTOM_KERNEL_REGISTER(einsum, phi::EinsumKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(einsum_infer, metax_gpu, @@ -34,7 +34,7 @@ PD_CUSTOM_KERNEL_REGISTER(einsum_infer, phi::EinsumInferKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + 
phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu new file mode 100644 index 00000000000..5647c806bfd --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/lars_momentum_kernel.h" + +PD_CUSTOM_KERNEL_REGISTER(lars_momentum, + metax_gpu, + ALL_LAYOUT, + phi::LarsMomentumKernel, + float, + double, + phi::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu index 1f84b628e84..dc92b2c6d69 100755 --- a/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu @@ -23,11 +23,13 @@ PD_CUSTOM_KERNEL_REGISTER(nonzero, int64_t, int, int16_t, - phi::dtype::float16, - phi::dtype::bfloat16, + phi::float16, + phi::bfloat16, bool, float, - double) { + double, + phi::complex64, + phi::complex128) { kernel->OutputAt(0).SetDataType(phi::DataType::INT64); } diff --git a/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu index 8ff1f5959ab..ca93a8ca079 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu @@ -23,6 +23,8 @@ PD_CUSTOM_KERNEL_REGISTER(put_along_axis, float, double, int64_t, + uint8_t, + int16_t, int, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index beefb730bf7..4c06609338c 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -869,19 +869,6 @@ index e838778952..83e805e75a 100644 namespace phi { namespace fusion { -diff --git a/paddle/phi/kernels/gpu/correlation_kernel.cu b/paddle/phi/kernels/gpu/correlation_kernel.cu -index 4c93778bde..c7bdf8a2cc 100644 ---- a/paddle/phi/kernels/gpu/correlation_kernel.cu -+++ b/paddle/phi/kernels/gpu/correlation_kernel.cu -@@ -103,7 +103,7 @@ void CorrelationCUDAKernel(const Context &dev_ctx, - int stride2, - int corr_type_multiply, - DenseTensor *out) { -- bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU; -+ bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM; - 
PADDLE_ENFORCE_EQ( - is_gpu_place, - true, diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index f0cca0f701..02ea957240 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h @@ -897,19 +884,6 @@ index f0cca0f701..02ea957240 100644 namespace phi { // To determine use cudnn or not. -diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu -index c2ddfa1347..c6adf5a6de 100644 ---- a/paddle/phi/kernels/gpu/dgc_kernel.cu -+++ b/paddle/phi/kernels/gpu/dgc_kernel.cu -@@ -188,7 +188,7 @@ void DGCKernel(const Context& dev_ctx, - int buf_size = paddle::communication::dgc::get_buffer_size(k); - phi::Allocator::AllocationPtr tmp_ious_data; - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -- if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -+ if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { - tmp_ious_data = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - buf_size, diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h @@ -974,19 +948,6 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/qr_kernel_impl.h" #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" -diff --git a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu -index 05a977828f..5136608c41 100644 ---- a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu -+++ b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu -@@ -58,7 +58,7 @@ void ShuffleBatchKernel(const Context& dev_ctx, - int64_t seed_int = 0; - if (seed.initialized()) { - const auto& seed_place = seed.place().GetType(); -- bool is_gpu_place = seed_place == phi::AllocationType::GPU; -+ bool is_gpu_place = seed_place == phi::AllocationType::GPU || seed_place == phi::AllocationType::CUSTOM; - if (is_gpu_place) { - // NOTE: We have overwritten GetKernelTypeForVar, so seed_place would - // not be CUDAPlace in practice. 
This case would only happen in Python diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index 9bc5326c90..79b57a8203 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -1144,32 +1105,6 @@ index 6f03f76eeb..5fe2c3e7dc 100644 #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" -diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h -index 7b85903776..3f4b298807 100644 ---- a/paddle/phi/kernels/impl/merged_momentum_impl.h -+++ b/paddle/phi/kernels/impl/merged_momentum_impl.h -@@ -297,7 +297,7 @@ void MergedMomentumInnerCompute( - params_out[idx], - velocities_out[idx]); - VLOG(10) << "Launch MergedMomentum cpu kernel."; -- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -+ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { - phi::funcs::ForRange for_range( - static_cast(dev_ctx), params[idx]->numel()); - const auto grad_type = grads[idx]->dtype(); -diff --git a/paddle/phi/kernels/impl/momentum_kernel_impl.h b/paddle/phi/kernels/impl/momentum_kernel_impl.h -index de5bcfc30b..eb2a9714f5 100644 ---- a/paddle/phi/kernels/impl/momentum_kernel_impl.h -+++ b/paddle/phi/kernels/impl/momentum_kernel_impl.h -@@ -457,7 +457,7 @@ void MomentumDenseImpl(const Context& dev_ctx, - regularization_coeff, - param_out, - velocity_out); -- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -+ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { - funcs::ForRange for_range(dev_ctx, param.numel()); - const auto grad_type = grad.dtype(); - #define PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(__nesterov, __reg_type) \ diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h From 7303ae2c86253711559c2fe2f0abbc770541fe5e Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 29 Sep 2025 17:08:34 +0800 Subject: [PATCH 53/58] [metax] modify kernels (#73) * modify kernels --- .../kernels/impl/addmm_kernel_impl.h | 1 + backends/metax_gpu/patch/paddle.patch | 60 ++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h index fb1368b069c..b517b719d49 100644 --- a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h @@ -98,6 +98,7 @@ void AddmmKernel(const Context& dev_ctx, y_dims[0])); dev_ctx.template Alloc(out); + if (out->numel() == 0) return; auto blas = funcs::GetBlas(dev_ctx); // calc broadcast dim diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 4c06609338c..69d714ef6e0 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -438,6 +438,21 @@ index d69eb67d6f..1d8b6e9375 100644 #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" +diff --git a/paddle/phi/kernels/funcs/embedding_grad.h b/paddle/phi/kernels/funcs/embedding_grad.h +index 461e6e2474..48a64ae9ce 100644 +--- a/paddle/phi/kernels/funcs/embedding_grad.h ++++ 
b/paddle/phi/kernels/funcs/embedding_grad.h +@@ -143,8 +143,8 @@ void LaunchEmbeddingGradDeterministicKernel(const GPUContext& dev_ctx, + constexpr int kWarpSize = 64; + constexpr int kBlockDimY = 16; + #else +- constexpr int kWarpSize = 32; +- constexpr int kBlockDimY = 32; ++ constexpr int kWarpSize = 64; ++ constexpr int kBlockDimY = 16; + #endif + dim3 threads(kWarpSize, kBlockDimY); + dim3 grids(static_cast((D + kWarpSize - 1) / kWarpSize)); diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu @@ -501,6 +516,49 @@ index 15e1a4a3c3..e4780538d7 100644 #include "paddle/phi/kernels/funcs/im2col.h" namespace phi { +diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h +index e5361b836e..5ad238df08 100644 +--- a/paddle/phi/kernels/funcs/math_cuda_utils.h ++++ b/paddle/phi/kernels/funcs/math_cuda_utils.h +@@ -175,12 +175,12 @@ struct KeyValuePair { + #define WARP_SIZE_WIDTH_MASK 0x3f + typedef u_int64_t warp_mask_t; + #else +-#define FINAL_MASK 0xffffffff +-#define HALF_WARP 16 +-#define WARP_SIZE 32 +-#define WARP_SIZE_WIDTH 5 +-#define WARP_SIZE_WIDTH_MASK 0x1f +-typedef unsigned warp_mask_t; ++#define FINAL_MASK 0xffffffffffffffffUL ++#define HALF_WARP 32 ++#define WARP_SIZE 64 ++#define WARP_SIZE_WIDTH 6 ++#define WARP_SIZE_WIDTH_MASK 0x3f ++typedef u_int64_t warp_mask_t; + #endif + + template +@@ -200,19 +200,13 @@ __inline__ __device__ T BlockReduceSum(T val, warp_mask_t mask) { + static __shared__ T shared[WARP_SIZE]; + int lane = threadIdx.x & WARP_SIZE_WIDTH_MASK; + int wid = threadIdx.x >> WARP_SIZE_WIDTH; +- + val = WarpReduceSum(val, mask); +- +- __syncthreads(); + if (lane == 0) shared[wid] = val; +- + __syncthreads(); +- + // align block_span to warpSize + int block_span = (blockDim.x + warpSize - 1) >> WARP_SIZE_WIDTH; + val = (lane < block_span) ? 
shared[lane] : static_cast(0.0f); + val = WarpReduceSum(val, mask); +- + return val; + } + diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -534,7 +592,7 @@ index 558d363b39..05da04b517 100644 #include "paddle/phi/kernels/funcs/scatter.cu.h" diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu -index 8b0baf5f5f..260482f124 100644 +index 047f52bd91..a05b34d3ba 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; From 8b184a32bd9e02c0d8b405d670a8e888a4522f42 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 29 Sep 2025 18:11:03 +0800 Subject: [PATCH 54/58] [metax] modify kernels (#74) * modify kernels --- .../gpudnn/conv_grad_kernel_register.cu | 37 ++++++++----------- .../kernels/gpudnn/conv_kernel_register.cu | 19 +++++----- .../kernels/gpudnn/conv_transpose_kernel.cu | 15 ++++---- .../depthwise_conv_grad_kernel.cu | 14 +++---- .../metax_kernel/depthwise_conv_kernel.cu | 14 +++---- 5 files changed, 45 insertions(+), 54 deletions(-) diff --git a/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu index e4acb2f95b6..2da42c7ff8c 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu @@ -437,26 +437,22 @@ void ConvCudnnGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(filter_grad); } - // bool has_use_addto = dev_ctx.HasDnnAttr("use_addto"); - bool has_use_addto = "true"; + bool has_use_addto = dev_ctx.HasDnnAttr("use_addto"); VLOG(4) << "GPUContext contains `use_addto`: " << has_use_addto; - // bool use_addto = has_use_addto - // ? PADDLE_GET_CONST(bool, "true") - // : false; - bool use_addto = "true"; + bool use_addto = has_use_addto + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("use_addto")) + : false; std::vector dilations = dilations_t; std::vector strides = strides_t; std::vector paddings = paddings_t; - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - bool has_exhaustive_search = "true"; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); VLOG(4) << "GPUContext contains `exhaustive_search`: " << has_exhaustive_search; - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? PADDLE_GET_CONST(bool, "true") - // : false; - bool exhaustive_search_attr = "true"; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; @@ -835,14 +831,13 @@ void ConvCudnnGradGradKernel( T* transformed_dx = nullptr; std::vector dilations = dilations_t; - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - // VLOG(4) << "GPUContext contains `exhaustive_search`: " - // << has_exhaustive_search; - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? 
PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) - // : false; - bool exhaustive_search_attr = "true"; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index 0a83b504c76..d6b243c956c 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -228,15 +228,16 @@ void ConvCudnnKernel(const Context& dev_ctx, std::vector paddings = paddings_t; std::vector dilations = dilations_t; - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - // VLOG(4) << "GPUContext contains `exhaustive_search`: " - // << has_exhaustive_search; - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) - // : false; - - bool exhaustive_search = FLAGS_cudnn_exhaustive_search; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; + + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu index 532b7af0db4..4049d2f3130 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -260,14 +260,13 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, return; } - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) - // : false; - // bool exhaustive_search = - // FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; - bool exhaustive_search = FLAGS_cudnn_exhaustive_search; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, diff --git a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu index f2475298963..4e5f881385a 100644 --- a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu @@ -54,14 +54,12 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, return; } - // bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); - // bool fuse_relu = - // has_fuse_relu - // ? 
PADDLE_GET_CONST( - // bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) - // : false; - bool has_fuse_relu = false; - bool fuse_relu = false; + bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); + bool fuse_relu = + has_fuse_relu + ? PADDLE_GET_CONST( + bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) + : false; std::vector strides = strides_t; std::vector paddings = paddings_t; diff --git a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu index 517f26b1c02..d3d6c4a4edd 100644 --- a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu @@ -48,14 +48,12 @@ void DepthwiseConvKernel(const Context& dev_ctx, const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - // bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); - // bool fuse_relu = - // has_fuse_relu - // ? PADDLE_GET_CONST( - // bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) - // : false; - bool has_fuse_relu = false; - bool fuse_relu = false; + bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); + bool fuse_relu = + has_fuse_relu + ? PADDLE_GET_CONST( + bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) + : false; if (channel_last) { PADDLE_ENFORCE_EQ( From 60f0ed637f73305e8f0fbd03917e3c8e2978d1ef Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 11:33:54 +0800 Subject: [PATCH 55/58] [metax] link mccl and fix missing kernel (#76) * [metax] link mccl and fix missing kernel --- backends/metax_gpu/CMakeLists.txt | 7 + .../cross_entropy_bwd_w_downcast.cu | 291 ++++++++++++ .../embedding_grad_add_to_kernel.cu | 27 ++ .../cuda_kernels/gammaln_grad_kernel.cu | 28 ++ .../moe_combine_no_weight_grad_kernel.cu | 25 + .../cuda_kernels/multihead_matmul_kernel.cu | 433 ++++++++++++++++++ backends/metax_gpu/kernels/funcs/generator.cc | 287 ++++++++++++ .../kernels/impl/gammaln_grad_kernel_impl.h | 112 +++++ .../metax_kernel/cudnn_lstm_grad_kernel.cu | 362 +++++++++++++++ .../kernels/metax_kernel/cudnn_lstm_kernel.cu | 428 +++++++++++++++++ backends/metax_gpu/tests/ignore.txt | 4 + 11 files changed, 2004 insertions(+) create mode 100644 backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu create mode 100644 backends/metax_gpu/kernels/funcs/generator.cc create mode 100644 backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h create mode 100644 backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 5930eaaebd2..2bb282cf54f 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -326,6 +326,8 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/increment_kernel.cu + 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/cross_entropy_bwd_w_downcast.cu # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu @@ -728,6 +730,11 @@ target_link_libraries( ${WARPCTC_LIBRARIES} ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) + +target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) +target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) +target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcpti.so) + include_directories(BEFORE ${PADDLE_SOURCE_DIR}) target_compile_definitions( diff --git a/backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu b/backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu new file mode 100644 index 00000000000..a0d5dfd7a5a --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu @@ -0,0 +1,291 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/cross_entropy_grad_kernel.h" + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "kernels/gpudnn/softmax_gpudnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/softmax.h" + +namespace phi { + +/* + Vectorized wrapper of softmax with cross entropy grad hard label. + Optimized with float4 vectorization for memory coalescing and improved + throughput. 
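+  The aligned fast path loads one float4 (four consecutive elements) per
+  thread, so LaunchOptimizedCrossEntropyGradKernel launches total_elements / 4
+  threads for it and falls back to the warp-level kernel whenever the
+  pointers are not 16-byte aligned or the element count is not a multiple
+  of 4.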
+*/ +template +__global__ void SoftmaxWithCrossEntropyGradHardLabelVectorized( + LogitT* __restrict__ logits_grad, + const T* __restrict__ loss_grad, + const T* __restrict__ softmax, + const LabelT* __restrict__ labels, + const int64_t n, + const int64_t dim, + const int64_t d, + const int ignore_index) { + // Vectorized load/store with float4 for 128-bit memory transactions + constexpr int VEC_SIZE = 4; + using VecT = typename phi::AlignedVector; + using SoftmaxVecT = typename phi::AlignedVector; + + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + int64_t vec_id = tid * VEC_SIZE; + + // Ensure we don't exceed bounds + if (vec_id >= n * dim * d) return; + + // Compute indices for vectorized access + int64_t idx_n = vec_id / (d * dim); + int64_t idx_dim_start = (vec_id / d) % dim; + int64_t idx_d = vec_id % d; + int64_t ids = idx_n * d + idx_d; + + // Load label once per thread + auto lbl = static_cast(labels[ids]); + + if (lbl == ignore_index) { + // Vectorized zero fill for ignore_index + VecT* vec_grad = reinterpret_cast(&logits_grad[vec_id]); + VecT zero_vec; +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + zero_vec.val[i] = static_cast(0.0f); + } + *vec_grad = zero_vec; + return; + } + + // Vectorized load of softmax values + SoftmaxVecT softmax_vec; + const SoftmaxVecT* softmax_ptr = + reinterpret_cast(&softmax[vec_id]); + softmax_vec = *softmax_ptr; + + // Load loss gradient (broadcast across vector elements) + T loss_grad_val = loss_grad[ids]; + + // Vectorized computation + VecT grad_vec; +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + int64_t current_dim = idx_dim_start + i; + if (current_dim < dim) { // Bounds check for partial vectors + float softmax_val = static_cast(softmax_vec.val[i]); + float grad_val; + + if (lbl == current_dim) { + grad_val = (softmax_val - 1.0f) * static_cast(loss_grad_val); + } else { + grad_val = softmax_val * static_cast(loss_grad_val); + } + + grad_vec.val[i] = static_cast(grad_val); + } else { + grad_vec.val[i] = static_cast(0.0f); + } + } + + // Vectorized store + VecT* grad_ptr = reinterpret_cast(&logits_grad[vec_id]); + *grad_ptr = grad_vec; +} + +/* + Specialized kernel for dimensions not divisible by vector size + Uses warp-level primitives for better performance on irregular sizes +*/ +template +__global__ void SoftmaxWithCrossEntropyGradHardLabelWarp( + LogitT* __restrict__ logits_grad, + const T* __restrict__ loss_grad, + const T* __restrict__ softmax, + const LabelT* __restrict__ labels, + const int64_t n, + const int64_t dim, + const int64_t d, + const int ignore_index) { + const int warps_per_block = 4; + const int threads_per_warp = 32; + const int threads_per_block = warps_per_block * threads_per_warp; + + int tid = blockIdx.x * threads_per_block + threadIdx.x; + int warp_id = threadIdx.x / threads_per_warp; + int lane_id = threadIdx.x % threads_per_warp; + + // Process multiple elements per thread using warp-level parallelism + int64_t elements_per_thread = + (n * dim * d + gridDim.x * threads_per_block - 1) / + (gridDim.x * threads_per_block); + + for (int e = 0; e < elements_per_thread; ++e) { + int64_t idx = tid + e * gridDim.x * threads_per_block; + if (idx >= n * dim * d) break; + + int64_t idx_n = idx / (d * dim); + int64_t idx_dim = (idx / d) % dim; + int64_t idx_d = idx % d; + int64_t ids = idx_n * d + idx_d; + + auto lbl = static_cast(labels[ids]); + + if (lbl == ignore_index) { + logits_grad[idx] = static_cast(0.0f); + } else if (lbl == idx_dim) { + logits_grad[idx] = + 
static_cast((static_cast(softmax[idx]) - 1.0f) * + static_cast(loss_grad[ids])); + } else { + logits_grad[idx] = + static_cast(static_cast(softmax[idx]) * + static_cast(loss_grad[ids])); + } + } +} + +/* + Optimized kernel selector based on problem size and alignment +*/ +template +void LaunchOptimizedCrossEntropyGradKernel(const GPUContext& dev_ctx, + LogitT* logits_grad, + const T* loss_grad, + const T* softmax, + const LabelT* labels, + const int64_t n, + const int64_t dim, + const int64_t d, + const int ignore_index) { + const int64_t total_elements = n * dim * d; + auto stream = dev_ctx.stream(); + + // Check alignment for vectorized kernel + bool is_aligned = (reinterpret_cast(logits_grad) % 16 == 0) && + (reinterpret_cast(softmax) % 16 == 0) && + (total_elements % 4 == 0); + + if (is_aligned && total_elements >= 1024) { + // Use vectorized kernel for aligned, large problems + constexpr int VEC_SIZE = 4; + const int threads_per_block = 256; + const int vec_elements = total_elements / VEC_SIZE; + const int blocks = + (vec_elements + threads_per_block - 1) / threads_per_block; + + SoftmaxWithCrossEntropyGradHardLabelVectorized + <<>>( + logits_grad, loss_grad, softmax, labels, n, dim, d, ignore_index); + } else { + // Use warp-specialized kernel for irregular sizes + const int warps_per_block = 4; + const int threads_per_block = warps_per_block * 32; + const int blocks = + std::min(1024, + static_cast((total_elements + threads_per_block - 1) / + threads_per_block)); + + SoftmaxWithCrossEntropyGradHardLabelWarp + <<>>( + logits_grad, loss_grad, softmax, labels, n, dim, d, ignore_index); + } +} + +template +void CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel( + const GPUContext& dev_ctx, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + int axis, + DenseTensor* logits_grad) { + // PADDLE_ENFORCE_EQ( + // dev_ctx.GetPlace().GetType(), + // phi::AllocationType::GPU, + // common::errors::Unavailable("softmax_with_cross_entropy operator's " + // "CUDA kernel only runs on GPU device.")); + + using LogitT = phi::bfloat16; + const T* loss_grad_data = loss_grad.data(); + DenseTensor* logit_grad = logits_grad; + + LogitT* logit_grad_data = nullptr; + logit_grad_data = dev_ctx.template Alloc(logit_grad); + + const int rank = logit_grad->dims().size(); + const int axis_v = phi::funcs::CanonicalAxis(axis, rank); + int axis_dim = logit_grad->dims()[axis_v]; + + const int64_t n = phi::funcs::SizeToAxis(axis_v, logit_grad->dims()); + const int64_t d = phi::funcs::SizeFromAxis(axis_v, logit_grad->dims()); + const int64_t remain = d / axis_dim; + + const T* softmax_data = softmax.data(); + const auto* label_data = label.data(); + + // Launch optimized kernel with automatic selection + LaunchOptimizedCrossEntropyGradKernel(dev_ctx, + logit_grad_data, + loss_grad_data, + softmax_data, + label_data, + n, + axis_dim, + remain, + -100); +} + +template +void CrossEntropyWithSoftmaxBwdWithDowncastKernel(const Context& dev_ctx, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + DenseTensor* logits_grad) { + constexpr int axis = -1; + if (logits_grad->numel() == 0) { + dev_ctx.template Alloc(logits_grad); + return; + } + auto dtype = label.dtype(); + PD_VISIT_INTEGRAL_TYPES( + dtype, "CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel", ([&] { + CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel( + dev_ctx, label, softmax, loss_grad, axis, logits_grad); + })); +} + +} // namespace phi + 
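+// Registration note: the dtypes listed below are the softmax / loss_grad
+// input dtypes. The logits_grad output is always written as bfloat16,
+// because CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel fixes LogitT to
+// phi::bfloat16; that is the "downcast" the op name refers to.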
+PD_REGISTER_PLUGIN_KERNEL(cross_entropy_with_softmax_bwd_w_downcast, + metax_gpu, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxBwdWithDowncastKernel, + float, + double, + phi::float16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu new file mode 100644 index 00000000000..6b20feee0fd --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu @@ -0,0 +1,27 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/embedding_grad_kernel.h" +#include "paddle/phi/kernels/funcs/embedding_grad.h" +#include "paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(embedding_grad_add_to, + metax_gpu, + ALL_LAYOUT, + phi::EmbeddingGradAddToAddToKernel, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu new file mode 100644 index 00000000000..c6bd53f007f --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kernels/impl/gammaln_grad_kernel_impl.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gammaln_grad_kernel.h" + +PD_REGISTER_PLUGIN_KERNEL(gammaln_grad, + metax_gpu, + ALL_LAYOUT, + phi::GammalnGradKernel, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu new file mode 100644 index 00000000000..e6984cf86d2 --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/legacy/gpu/moe_combine_no_weight_grad_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(moe_combine_no_weight_grad, + metax_gpu, + ALL_LAYOUT, + phi::MoeCombineNoWeightGradKernel, + float, + double, + phi::bfloat16, + phi::float16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu new file mode 100644 index 00000000000..151c929e41c --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu @@ -0,0 +1,433 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "kernels/funcs/blas/blas.h" +#include "paddle/common/errors.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" + +namespace phi { +namespace fusion { + +template +__global__ void transpose(T *src, + T *dst, + const int batch_size, + const int seq_len, + const int head_num, + const int size_per_head) { + int batch_id = blockIdx.x / (head_num * seq_len); + int seq_id = blockIdx.x % seq_len; + int head_id = (blockIdx.x % (head_num * seq_len)) / seq_len; + dst[batch_id * (head_num * seq_len * size_per_head) + + seq_id * head_num * size_per_head + head_id * size_per_head + + threadIdx.x] = src[blockIdx.x * size_per_head + threadIdx.x]; +} + +template +inline __device__ T add_func(T a, T b); + +template <> +__device__ float add_func(float a, float b) { + return a + b; +} + +template <> +__device__ float2 add_func(float2 a, float2 b) { + float2 c; + c.x = a.x + b.x; + c.y = a.y + b.y; + return c; +} + +template <> +__device__ float4 add_func(float4 a, float4 b) { + float4 c; + c.x = a.x + b.x; + c.y = a.y + b.y; + c.z = a.z + b.z; + c.w = a.w + b.w; + return c; +} +#if defined(PADDLE_WITH_CUDA) +template <> +__device__ half2 add_func(half2 a, half2 b) { +#if __CUDA_ARCH__ >= 530 + return __hadd2(a, b); +#else + return half2(__float2half(__half2float(a.x) + __half2float(b.x)), + __float2half(__half2float(b.x) + __half2float(b.y))); +#endif +} + +template <> +__device__ half add_func(half a, half b) { +#if __CUDA_ARCH__ >= 530 + return __hadd(a, b); +#else + return __float2half(__half2float(a) + __half2float(b)); +#endif +} +#endif + +template +__global__ void TransposeQkvKernel(const int H, + const T *input, + const T *bias, + T *output) { + // Input: BxSx3xNxH + // Bias: 3xNxH + 
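+    // (h == head_size / 2: each thread moves one half2, i.e. two fp16
+    // values, so the per-block element budget in the check below is
+    // effectively 1024 * 2.)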
+  // Output: 3xBxNxSxH
+  int n = threadIdx.y;
+  int s = blockIdx.x;
+  int b = blockIdx.y;
+  int m = blockIdx.z;
+
+  const int N = blockDim.y;
+  const int S = gridDim.x;
+  const int B = gridDim.y;
+
+  const int NH = N * H;
+  const int NHS = NH * S;
+  const int in_offset = n * H + m * NH + s * 3 * NH + b * NHS * 3;
+  const int bias_offset = m * NH + n * H;
+  const int out_offset = s * H + n * S * H + b * NHS + m * NHS * B;
+
+  const int i = threadIdx.x;
+  output[out_offset + i] =
+      add_func(input[in_offset + i], bias[bias_offset + i]);
+}
+
+template <typename T>
+void TransQKVWithBias(const int batch,
+                      const int seq_len,
+                      const int head_size,
+                      const int head_num,
+                      const T *input,
+                      const T *bias,
+                      T *output,
+                      gpuStream_t stream);
+
+template <>
+void TransQKVWithBias(const int batch,
+                      const int seq_len,
+                      const int head_size,
+                      const int head_num,
+                      const float *input,
+                      const float *bias,
+                      float *output,
+                      gpuStream_t stream) {
+  // BxSx3xNxH + 3xNxH -> 3xBxNxSxH
+  int scratch_size = batch * head_num * seq_len * seq_len;
+  const dim3 grid(seq_len, batch, 3);
+  // scratch % 4 == 0 to ensure the alignment
+  if (head_size % 4 == 0 && scratch_size % 4 == 0) {
+    const int h = head_size / 4;
+    const float4 *input4 = reinterpret_cast<const float4 *>(input);
+    const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
+    float4 *output4 = reinterpret_cast<float4 *>(output);
+    const dim3 block(h, head_num, 1);
+
+    // limit h * head_num to max block size(1024).
+    PADDLE_ENFORCE_LE(h * head_num,
+                      1024,
+                      common::errors::InvalidArgument(
+                          "head_num (%d) * head_size (%d) should <= %d",
+                          head_num,
+                          head_size,
+                          1024 * 4));
+    TransposeQkvKernel<float4>
+        <<<grid, block, 0, stream>>>(h, input4, bias4, output4);
+  } else if (head_size % 2 == 0 && scratch_size % 2 == 0) {
+    const int h = head_size / 2;
+    const float2 *input2 = reinterpret_cast<const float2 *>(input);
+    const float2 *bias2 = reinterpret_cast<const float2 *>(bias);
+    float2 *output2 = reinterpret_cast<float2 *>(output);
+    const dim3 block(h, head_num, 1);
+    // limit h * head_num to max block size(1024).
+    PADDLE_ENFORCE_LE(h * head_num,
+                      1024,
+                      common::errors::InvalidArgument(
+                          "head_num (%d) * head_size (%d) should <= %d",
+                          head_num,
+                          head_size,
+                          1024 * 2));
+    TransposeQkvKernel<float2>
+        <<<grid, block, 0, stream>>>(h, input2, bias2, output2);
+  } else {
+    const dim3 block(head_size, head_num, 1);
+    // limit head_size * head_num to max block size(1024).
+    PADDLE_ENFORCE_LE(head_size * head_num,
+                      1024,
+                      common::errors::InvalidArgument(
+                          "head_num (%d) * head_size (%d) should <= %d",
+                          head_num,
+                          head_size,
+                          1024));
+    TransposeQkvKernel<float>
+        <<<grid, block, 0, stream>>>(head_size, input, bias, output);
+  }
+}
+
+#if defined(PADDLE_WITH_CUDA)
+template <>
+void TransQKVWithBias(const int batch,
+                      const int seq_len,
+                      const int head_size,
+                      const int head_num,
+                      const phi::float16 *input,
+                      const phi::float16 *bias,
+                      phi::float16 *output,
+                      gpuStream_t stream) {
+  // BxSx3xNxH + 3xNxH -> 3xBxNxSxH
+  int scratch_size = batch * head_num * seq_len * seq_len;
+  const dim3 grid(seq_len, batch, 3);
+  if (head_size % 2 == 0 && scratch_size % 2 == 0) {
+    const int h = head_size / 2;
+    const half2 *input2 = reinterpret_cast<const half2 *>(input);
+    const half2 *bias2 = reinterpret_cast<const half2 *>(bias);
+    half2 *output2 = reinterpret_cast<half2 *>(output);
+    const dim3 block(h, head_num, 1);
+    // limit h * head_num to max block size(1024).
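+    // With half2 vectorization each thread handles two fp16 values, so the
+    // block x-dimension is h = head_size / 2 rather than head_size.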
+    PADDLE_ENFORCE_LE(h * head_num,
+                      1024,
+                      common::errors::InvalidArgument(
+                          "head_num (%d) * head_size (%d) should <= %d",
+                          head_num,
+                          head_size,
+                          1024 * 2));
+    TransposeQkvKernel<half2>
+        <<<grid, block, 0, stream>>>(h, input2, bias2, output2);
+  } else {
+    const dim3 block(head_size, head_num, 1);
+    const half *input_half = reinterpret_cast<const half *>(input);
+    const half *bias_half = reinterpret_cast<const half *>(bias);
+    half *output_half = reinterpret_cast<half *>(output);
+
+    // limit head_size * head_num to max block size(1024).
+    PADDLE_ENFORCE_LE(head_size * head_num,
+                      1024,
+                      common::errors::InvalidArgument(
+                          "head_num (%d) * head_size (%d) should <= %d",
+                          head_num,
+                          head_size,
+                          1024));
+    TransposeQkvKernel<half><<<grid, block, 0, stream>>>(
+        head_size, input_half, bias_half, output_half);
+  }
+}
+#endif
+
+inline int round_up(int seq_len, int multiple = 32) {
+  PADDLE_ENFORCE_GT(
+      multiple,
+      0,
+      common::errors::InvalidArgument(
+          "multiple should be a positive number, but it's (%d)", multiple));
+  return ((seq_len + multiple - 1) / multiple) * multiple;
+}
+
+template <typename T>
+__global__ void broadcast(const T *src,
+                          T *dst,
+                          const int seq_len,
+                          const int head_num) {
+  int batch_id = blockIdx.x / (head_num * seq_len);
+  int dst_offset = blockIdx.x * seq_len;
+  if (threadIdx.x < seq_len) {
+    dst[threadIdx.x + dst_offset] = src[threadIdx.x + batch_id * seq_len];
+  }
+}
+
+template <typename T>
+__global__ void broadcast_batch_head_number(const T *src,
+                                            T *dst,
+                                            const int batch_size,
+                                            const int seq_len,
+                                            const int head_num) {
+  int src_seq_id = blockIdx.x % seq_len;
+  int dst_offset = blockIdx.x * seq_len;
+  if (threadIdx.x < seq_len) {
+    dst[threadIdx.x + dst_offset] = src[threadIdx.x + src_seq_id * seq_len];
+  }
+}
+
+template <typename T, typename Context>
+void MultiheadMatmulKernel(const Context &dev_ctx,
+                           const DenseTensor &input,
+                           const DenseTensor &w,
+                           const DenseTensor &bias,
+                           const paddle::optional<DenseTensor> &bias_qk,
+                           const bool transpose_q,
+                           const bool transpose_k,
+                           const bool transpose_v,
+                           const float alpha,
+                           const int head_number,
+                           DenseTensor *out) {
+  auto *input_d = input.data<T>();
+  auto *w_d = w.data<T>();
+  auto *bias_d = bias.data<T>();
+  auto *bias_qk_d = bias_qk ? bias_qk->data<T>() : nullptr;
+  T scale = static_cast<T>(alpha);
+
+  // compute q*k with eltadd
+  auto stream = dev_ctx.stream();
+  // should be (B * S * hidden)
+  auto input_dims = input.dims();
+  // should be (hidden * 3 * all_head_size)
+  auto w_dims = w.dims();
+  int batch = input_dims[0];
+  int seq_len = input_dims[1];
+  int hidden = input_dims[2];
+  phi::DenseTensor temp_bias_tensor;
+  // if bias_qk is [batch, 1, 1, seq_len], bias_qk_d needs to be broadcast
+  if (bias_qk && bias_qk->numel() == (batch * seq_len)) {
+    VLOG(4) << "Broadcast bias_qk from [batch, 1, 1, seq_len]";
+    temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len});
+    auto *temp_qk_bias = dev_ctx.template Alloc<T>(
+        &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
+    int grid = batch * head_number * seq_len;
+    int block = round_up(seq_len);
+    broadcast<<<grid, block, 0, stream>>>(
+        bias_qk_d, temp_qk_bias, seq_len, head_number);
+    bias_qk_d = static_cast<const T *>(temp_qk_bias);
+  }
+  // if bias_qk is [1, 1, seq_len, seq_len], bias_qk_d needs to be
+  // broadcast
+  if (bias_qk && bias_qk->numel() == (1 * seq_len * seq_len)) {
+    VLOG(4) << "Broadcast bias_qk from [1, 1, seq_len, seq_len]";
+    temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len});
+    auto *temp_qk_bias = dev_ctx.template Alloc<T>(
+        &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
+    int grid = batch * head_number * seq_len;
+    int block = round_up(seq_len);
+    broadcast_batch_head_number<<<grid, block, 0, stream>>>(
+        bias_qk_d, temp_qk_bias, batch, seq_len, head_number);
+    bias_qk_d = static_cast<const T *>(temp_qk_bias);
+  }
+  if (!bias_qk) {
+    int size = batch * head_number * seq_len * seq_len;
+    temp_bias_tensor.Resize({size});
+    auto *temp_qk_bias = dev_ctx.template Alloc<T>(
+        &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
+#ifdef PADDLE_WITH_HIP
+    hipMemset(temp_qk_bias, 0, sizeof(float) * size);
+#else
+    cudaMemset(temp_qk_bias, 0, sizeof(float) * size);
+#endif
+    bias_qk_d = static_cast<const T *>(temp_qk_bias);
+  }
+  int all_head_size = w_dims[2];
+  int head_size = all_head_size / head_number;
+
+  out->Resize({batch, seq_len, all_head_size});
+  auto *output_d = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
+
+  // (B*S, hidden)
+  const phi::DenseTensor input_matrix =
+      phi::ReshapeToMatrix(input, 2 /*x_num_col_dims*/);
+  // (hidden, 3 * all_head_size)
+  const phi::DenseTensor w_matrix =
+      phi::ReshapeToMatrix(w, 1 /*y_num_col_dims*/);
+
+  phi::DenseTensor temp_out_tensor;
+  auto temp_out_dims =
+      common::make_ddim({batch, seq_len, 3, head_number, head_size});
+  temp_out_tensor.Resize(
+      {batch * seq_len, common::product(temp_out_dims) / (batch * seq_len)});
+  auto *temp_out_data = dev_ctx.template Alloc<T>(
+      &temp_out_tensor, temp_out_tensor.numel() * sizeof(T));
+
+  // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  blas.MatMul(input_matrix, w_matrix, &temp_out_tensor);
+  VLOG(2) << "(B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)";
+  // temp_out_tensor.Resize(temp_out_dims);
+
+  phi::DenseTensor multihead_temp_tensor;
+  // B * head_number * S * S * 1 + B * S * 3 * N * H
+  int scratch_size = batch * head_number * seq_len * seq_len * 1;
+  multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()});
+  auto *multihead_temp_data = dev_ctx.template Alloc<T>(
+      &multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T));
+
+  auto *qkptr = multihead_temp_data;
+  auto *tptr = multihead_temp_data + scratch_size;
+
+  // Do the transpose with bias.
+  // BxSx3xNxH => tptr: 3xBxNxSxH.
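+  // For example, with B=2, S=4, N=8, H=64 the fused GEMM output is viewed as
+  // [2, 4, 3, 8, 64] and rearranged to [3, 2, 8, 4, 64], so each of Q, K and
+  // V becomes a contiguous [B, N, S, H] block for the batched attention.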
+  TransQKVWithBias(batch,
+                   seq_len,
+                   head_size,
+                   head_number,
+                   temp_out_data,
+                   bias_d,
+                   tptr,
+                   stream);
+  if (std::is_same<T, phi::float16>::value) {
+    phi::funcs::MultiheadGPUComputeFunctor<half> multihead_compute_func;
+    multihead_compute_func(dev_ctx,
+                           batch,
+                           seq_len,
+                           head_number,
+                           head_size,
+                           reinterpret_cast<half *>(qkptr),
+                           reinterpret_cast<const half *>(bias_qk_d),
+                           false,
+                           reinterpret_cast<half *>(tptr),
+                           __float2half(static_cast<float>(scale)),
+                           __float2half(0.0));
+  } else {
+    phi::funcs::MultiheadGPUComputeFunctor<T> multihead_compute_func;
+    multihead_compute_func(dev_ctx,
+                           batch,
+                           seq_len,
+                           head_number,
+                           head_size,
+                           qkptr,
+                           bias_qk_d,
+                           false,
+                           tptr,
+                           scale,
+                           T(0.0));
+  }
+
+  int grid = batch * head_number * seq_len;
+  int block = head_size;
+  transpose<T><<<grid, block, 0, stream>>>(
+      tptr, output_d, batch, seq_len, head_number, head_size);
+}
+
+}  // namespace fusion
+}  // namespace phi
+
+#if defined(PADDLE_WITH_CUDA)
+PD_REGISTER_PLUGIN_KERNEL(multihead_matmul,
+                          metax_gpu,
+                          ALL_LAYOUT,
+                          phi::fusion::MultiheadMatmulKernel,
+                          float,
+                          phi::float16) {}
+#else
+PD_REGISTER_PLUGIN_KERNEL(multihead_matmul,
+                          metax_gpu,
+                          ALL_LAYOUT,
+                          phi::fusion::MultiheadMatmulKernel,
+                          float) {}
+#endif
diff --git a/backends/metax_gpu/kernels/funcs/generator.cc b/backends/metax_gpu/kernels/funcs/generator.cc
new file mode 100644
index 00000000000..8fcbf474b07
--- /dev/null
+++ b/backends/metax_gpu/kernels/funcs/generator.cc
@@ -0,0 +1,287 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "paddle/phi/core/generator.h"
+
+#include <glog/logging.h>
+
+#include <deque>
+#include <memory>
+#include <utility>
+
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/xpu/xpu_info.h"
+#include "paddle/phi/core/enforce.h"
+
+static uint64_t GetRandomSeed() {
+  std::random_device rd;
+  // double has a 53-bit significand, so limit the uint64 seed to 53 bits
+  return ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF;
+}
+
+namespace phi {
+
+const std::shared_ptr<Generator>& DefaultXPUGenerator(int64_t device_id) {
+#if defined(PADDLE_WITH_XPU)
+
+  static int64_t num_xpu_devices = -1;
+  static std::once_flag num_devices_init_flag;
+  static std::deque<std::once_flag> xpu_device_flags;
+  static std::vector<std::shared_ptr<Generator>> default_xpu_generators;
+
+  std::call_once(num_devices_init_flag, []() {
+    num_xpu_devices = phi::backends::xpu::GetXPUDeviceCount();
+    xpu_device_flags.resize(num_xpu_devices);
+    default_xpu_generators.resize(num_xpu_devices);
+  });
+  if (device_id < 0) {
+    PADDLE_THROW(common::errors::InvalidArgument(
+        "xpu device id should be greater than or equal to 0"));
+  }
+
+  std::call_once(xpu_device_flags[device_id], [device_id]() {
+    default_xpu_generators[device_id] =
+        std::make_shared<Generator>(GetRandomSeed(), device_id);
+    VLOG(4) << "initial seed: "
+            << default_xpu_generators[device_id]->GetCurrentSeed();
+  });
+  return default_xpu_generators[device_id];
+#else
+  PADDLE_THROW(common::errors::PermissionDenied(
+      "getDefaultXPUGenerator is only supported in XPU place"));
+#endif
+}
+
+const std::shared_ptr<Generator>& DefaultCUDAGenerator(int64_t device_id) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+  static int64_t num_cuda_devices = -1;
+  static std::once_flag num_devices_init_flag;
+  static std::deque<std::once_flag> cuda_device_flags;
+  static std::vector<std::shared_ptr<Generator>> default_cuda_generators;
+
+  std::call_once(num_devices_init_flag, []() {
+    num_cuda_devices = phi::backends::gpu::GetGPUDeviceCount();
+    cuda_device_flags.resize(num_cuda_devices);
+    default_cuda_generators.resize(num_cuda_devices);
+  });
+  if (device_id < 0) {
+    PADDLE_THROW(common::errors::InvalidArgument(
+        "cuda device id should be greater than or equal to 0"));
+  }
+
+  std::call_once(cuda_device_flags[device_id], [device_id]() {
+    default_cuda_generators[device_id] =
+        std::make_shared<Generator>(GetRandomSeed(), device_id);
+    VLOG(7) << "initial seed: "
+            << default_cuda_generators[device_id]->GetCurrentSeed();
+  });
+  return default_cuda_generators[device_id];
+#else
+  PADDLE_THROW(common::errors::PermissionDenied(
+      "getDefaultCUDAGenerator is only supported in CUDA place"));
+#endif
+}
+
+const std::shared_ptr<Generator>& DefaultCPUGenerator() {
+  static auto default_cpu_generator =
+      std::make_shared<Generator>(GetRandomSeed());
+  return default_cpu_generator;
+}
+
+const std::shared_ptr<Generator>& DefaultCustomDeviceGenerator(
+    const phi::CustomPlace& place) {
+  static std::
+      unordered_map<phi::Place, std::shared_ptr<Generator>, phi::Place::Hash>
+          generators;
+  if (generators.find(place) == generators.end()) {
+    generators.insert({place, std::make_shared<Generator>(GetRandomSeed())});
+  }
+  return generators[place];
+}
+
+using RNGMap = std::unordered_map<std::string, std::shared_ptr<Generator>>;
+
+static RNGMap& GetRandomSeedGeneratorMap() {
+  static auto random_seed_generator_map = RNGMap();
+  return random_seed_generator_map;
+}
+
+const std::shared_ptr<Generator>& SetRandomSeedGenerator(
+    const std::string& name, uint64_t seed) {
+  auto& rng_map = GetRandomSeedGeneratorMap();
+  auto iter = rng_map.find(name);
+  PADDLE_ENFORCE_EQ(iter == rng_map.end(),
+                    true,
+                    common::errors::AlreadyExists(
+                        "%s RandomSeedGenerator already exists", name));
+
+  auto generator = std::make_shared<Generator>(seed);
+  bool emplace_success = rng_map.emplace(name, generator).second;
+  PADDLE_ENFORCE_EQ(
+      emplace_success,
+      true,
+      common::errors::PermissionDenied(
+          "SetRandomSeedGenerator cannot emplace %s RandomSeedGenerator",
+          name));
+  return rng_map[name];
+}
+
+const std::shared_ptr<Generator>& GetRandomSeedGenerator(
+    const std::string& name) {
+  auto& rng_map = GetRandomSeedGeneratorMap();
+  auto iter = rng_map.find(name);
+  PADDLE_ENFORCE_EQ(iter != rng_map.end(),
+                    true,
+                    common::errors::NotFound(
+                        "%s RandomSeedGenerator is not found, please "
+                        "use `set_random_seed_generator` to set rng first",
+                        name));
+  return iter->second;
+}
+
+// There are 3 conditions:
+// (1) op seed is set, use op seed.
+// (2) op seed is not set, global seed is set, use global seed.
+// (3) op seed is not set, global seed is not set too, use random seed from
+// RandomGenerator.
+std::shared_ptr<std::mt19937_64> GetCPURandomEngine(uint64_t seed) {
+  if (seed == 0) {
+    VLOG(4) << "Use random cpu_engine from generator";
+    return DefaultCPUGenerator()->GetCPUEngine();
+  } else {
+    // NOTE(zhiqiu): creating a cpu_engine instance every time instead of
+    // using OpDefaultCPUEngine(); this is the legacy behavior of random
+    // operators. The benefit is that when running PE with a fixed seed in
+    // multiple threads, each thread has its own cpu_engine and they do not
+    // affect each other.
+    //
+    // And we need to measure the determinacy of Generator in PE.
+    auto cpu_engine = std::make_shared<std::mt19937_64>();
+    static std::mutex mu_;
+    {
+      std::lock_guard<std::mutex> lock(mu_);
+      cpu_engine->seed(seed);
+    }
+    return cpu_engine;
+  }
+}
+
+inline void Generator::print_state_info() {
+  VLOG(7) << "Generator Random state "
+          << "device id: " << state().device << ", seed: " << state().seed
+          << ", offset: " << state().offset << ", cpu_engine: " << cpu_engine();
+}
+
+Generator::Generator() {
+  auto seed = GetRandomSeed();
+  current_index = states_.size();
+  states_.emplace_back(-1, seed);
+  print_state_info();
+}
+
+Generator::Generator(uint64_t seed) {
+  current_index = states_.size();
+  states_.emplace_back(-1, seed);
+  print_state_info();
+}
+
+Generator::Generator(uint64_t seed, int64_t device_id) {
+  current_index = states_.size();
+  // device id first, then seed
+  states_.emplace_back(device_id, seed);
+  print_state_info();
+}
+
+phi::Generator::GeneratorState Generator::GetState() { return state(); }
+
+void Generator::SetState(const phi::Generator::GeneratorState& state) {
+  std::lock_guard<std::mutex> lock(mu_);
+  if (current_index < states_.size())
+    states_[current_index] = state;
+  else
+    PADDLE_THROW(common::errors::NotFound("Generator index is not found"));
+  print_state_info();
+}
+
+uint64_t Generator::GetStateIndex() { return current_index; }
+
+void Generator::SetStateIndex(uint64_t StateIndex) {
+  std::lock_guard<std::mutex> lock(mu_);
+  if (current_index < states_.size())
+    current_index = StateIndex;
+  else
+    PADDLE_THROW(common::errors::NotFound("Generator index is not found"));
+}
+
+uint64_t Generator::RegisterStateIndex(const GeneratorState& state) {
+  std::lock_guard<std::mutex> lock(mu_);
+  auto new_index = states_.size();
+  states_.push_back(state);
+  current_index = new_index;
+  return new_index;
+}
+
+inline Generator::GeneratorState& Generator::state() {
+  if (current_index < states_.size())
+    return states_[current_index];
+  else
+    PADDLE_THROW(common::errors::NotFound("Generator index is not found"));
+}
+
+inline std::shared_ptr<std::mt19937_64> Generator::cpu_engine() {
+  return state().cpu_engine;
+}
+
+uint64_t Generator::GetCurrentSeed() {
+  std::lock_guard<std::mutex> lock(mu_);
+  return state().seed;
+}
+
+uint64_t Generator::Seed() {
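+  // Draws a fresh random seed, reseeds the current state, and returns it.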
+  std::lock_guard<std::mutex> lock(mu_);
+  uint64_t seed = GetRandomSeed();
+  state().reset(seed);
+  return seed;
+}
+
+void Generator::SetCurrentSeed(uint64_t seed) {
+  std::lock_guard<std::mutex> lock(mu_);
+  state().reset(seed);
+}
+
+std::shared_ptr<std::mt19937_64> Generator::GetCPUEngine() {
+  return cpu_engine();
+}
+
+uint64_t Generator::Random64() {
+  std::lock_guard<std::mutex> lock(mu_);
+  auto current_engine = cpu_engine();
+  return (*current_engine)();
+}
+
+std::pair<uint64_t, uint64_t> Generator::IncrementOffset(uint64_t increment) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU)
+  std::lock_guard<std::mutex> lock(mu_);
+  uint64_t offset = state().offset;
+  state().offset = offset + increment;
+  print_state_info();
+  return std::make_pair(state().seed, offset);
+#else
+  PADDLE_THROW(common::errors::PermissionDenied(
+      "Increment Offset is only supported in CUDA place"));
+#endif
+}
+
+}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h
new file mode 100644
index 00000000000..2b222ba3b2c
--- /dev/null
+++ b/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+
+namespace phi {
+template <typename T>
+HOSTDEVICE T digamma_positive_domain(T x) {
+  constexpr T c = T{8.5};
+  constexpr T euler_mascheroni = T{0.57721566490153286060};
+  T r;
+  T value;
+  T x2;
+
+  if (x <= T{0.000001}) {
+    value = -euler_mascheroni - T{1.0} / x + T{1.6449340668482264365} * x;
+    return value;
+  }
+
+  value = T{0.0};
+  x2 = x;
+  while (x2 < c) {
+    value = value - T{1.0} / x2;  // NOLINT
+    x2 = x2 + T{1.0};
+  }
+
+  r = T{1.0} / x2;
+  value = value + std::log(x2) - T{0.5} * r;
+
+  r = r * r;
+
+  value = value -
+          r * (T{1.0} / T{12.0} -
+               r * (T{1.0} / T{120.0} -
+                    r * (T{1.0} / T{252.0} -
+                         r * (T{1.0} / T{240.0} - r * (T{1.0} / T{132.0})))));
+
+  return value;
+}
+
+template <typename T>
+HOSTDEVICE T digamma(T x) {
+  const static T pi = T{3.14159265358979323846};  // NOLINT
+
+  if (x == T{0.0}) {
+    T inf = std::numeric_limits<T>::infinity();
+    return std::signbit(x) ? inf : -inf;
+  } else if (x < T{0.0}) {
+    if (x == std::trunc(x)) {
+      return std::numeric_limits<T>::quiet_NaN();
+    } else {
+      T iptr;
+      T frac_part = std::modf(x, &iptr);
+      return digamma_positive_domain(T{1.0} - x) -
+             pi / std::tan(pi * frac_part);
+    }
+  } else {
+    return digamma_positive_domain(x);
+  }
+}
+
+template <typename T>
+struct GammalnGradFunctor {
+  GammalnGradFunctor(const T* dout, const T* x, T* output, int64_t numel)
+      : dout_(dout), x_(x), output_(output), numel_(numel) {}
+
+  HOSTDEVICE void operator()(int64_t idx) const {
+    using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+    const MT mp_dout = static_cast<MT>(dout_[idx]);
+    const MT mp_x = static_cast<MT>(x_[idx]);
+    output_[idx] = static_cast<T>(mp_dout * digamma<MT>(mp_x));
+  }
+
+ private:
+  const T* dout_;
+  const T* x_;
+  T* output_;
+  int64_t numel_;
+};
+template <typename T, typename Context>
+void GammalnGradKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& d_out,
+                       DenseTensor* d_x) {
+  auto numel = d_out.numel();
+  if (d_x && d_x->numel() == 0) {
+    dev_ctx.template Alloc<T>(d_x);
+    return;
+  }
+  auto* dout_data = d_out.data<T>();
+  auto* x_data = x.data<T>();
+  auto* dx_data =
+      dev_ctx.template Alloc<T>(d_x, static_cast<size_t>(numel * sizeof(T)));
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
+  GammalnGradFunctor<T> functor(dout_data, x_data, dx_data, numel);
+  for_range(functor);
+}
+}  // namespace phi
diff --git a/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu
new file mode 100644
index 00000000000..766d984a25b
--- /dev/null
+++ b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu
@@ -0,0 +1,362 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
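+//
+// cuDNN/MIOpen LSTM backward kernel for the metax_gpu plugin. It computes
+// gradients w.r.t. the input, the initial hidden/cell states and the
+// (flattened) weights, dispatching to the fused v8 API on cuDNN >= 9.0 and
+// to the legacy padded/unpadded paths otherwise.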
+
+#include "kernels/metax_kernel/metax_context.h"  // NOLINT
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cudnn_lstm_grad_kernel.h"
+#include "paddle/phi/kernels/gpu/cudnn_lstm_utils.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CudnnLSTMGradKernel(
+    const Context &dev_ctx,
+    const DenseTensor &x,
+    const DenseTensor &init_h,
+    const DenseTensor &init_c,
+    const paddle::optional<std::vector<const DenseTensor *>> &weight_list,
+    const paddle::optional<DenseTensor> &sequence_length,
+    const DenseTensor &out,
+    const DenseTensor &reserve,
+    const DenseTensor &state_out,
+    const DenseTensor &out_grad,
+    const DenseTensor &last_h_grad,
+    const DenseTensor &last_c_grad,
+    float dropout_prob,
+    bool is_bidirec,
+    int hidden_size,
+    int num_layers,
+    bool is_test,
+    int seed,
+    DenseTensor *x_grad,
+    DenseTensor *init_h_grad,
+    DenseTensor *init_c_grad,
+    std::vector<DenseTensor *> weight_grad_list) {
+  auto input_dims = x.dims();
+  auto init_h_dims = init_h.dims();
+  auto init_c_dims = init_c.dims();
+
+  auto *init_h_data = init_h.data<T>();
+  auto *init_c_data = init_c.data<T>();
+  auto *out_data = out.data<T>();
+  auto *out_grad_data = out_grad.data<T>();
+  auto *last_h_grad_data = last_h_grad.data<T>();
+  auto *last_c_grad_data = last_c_grad.data<T>();
+
+  auto running_weight_list = *weight_list.get_ptr();
+  int weight_numel = size_sum(running_weight_list);
+  bool continuous = is_continuous<T, std::vector<const DenseTensor *>>(
+      running_weight_list);
+
+  // auto handle = dev_ctx.cudnn_handle();
+  auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace());
+  auto place = dev_ctx.GetPlace();
+  auto stream = dev_ctx.stream();
+  phi::DenseTensor weight_whole;
+  T *weight_data = nullptr;
+
+  if (!continuous) {
+    weight_whole.Resize({weight_numel});
+    dev_ctx.template Alloc<T>(&weight_whole);
+    weight_to_tensor<T>(place, stream, running_weight_list, &weight_whole);
+    weight_data = weight_whole.data<T>();
+  } else {
+    weight_data = const_cast<T *>(running_weight_list[0]->data<T>());
+  }
+
+  phi::DenseTensor weight_grad;
+  phi::funcs::SetConstant<Context, T> zero;
+  weight_grad.Resize({weight_numel});
+  dev_ctx.template Alloc<T>(&weight_grad);
+  zero(dev_ctx, &weight_grad, static_cast<T>(0.0));
+  T *weight_grad_data = weight_grad.data<T>();
+
+  int offset = 0;
+  for (size_t i = 0; i < weight_grad_list.size(); ++i) {
+    size_t len = weight_grad_list[i]->numel();
+    auto dim = weight_grad_list[i]->dims();
+    weight_grad_list[i]
+        ->ShareDataWith(weight_grad.Slice(static_cast<int64_t>(offset),
+                                          static_cast<int64_t>(offset + len)))
+        .Resize(dim);
+    offset += len;
+  }
+
+  x_grad->Resize(input_dims);
+  dev_ctx.template Alloc<T>(x_grad);
+  auto *in_grad_data = x_grad->data<T>();
+
+  if (init_h_grad) {
+    init_h_grad->Resize(init_h_dims);
+    dev_ctx.template Alloc<T>(init_h_grad);
+  }
+  auto *init_h_grad_data = init_h_grad ? init_h_grad->data<T>() : nullptr;
+
+  if (init_c_grad) {
+    init_c_grad->Resize(init_c_dims);
+    dev_ctx.template Alloc<T>(init_c_grad);
+  }
+  auto *init_c_grad_data = init_c_grad ? init_c_grad->data<T>() : nullptr;
+
+  auto running_seq_length = sequence_length.get_ptr();
+  bool has_seq_length = running_seq_length != nullptr;
+  std::vector<int> SequenceLength;
+  if (has_seq_length) {
+    SequenceLength = phi::GetVectorFromTensor<int>(running_seq_length);
+  }
+
+  int seq_length = input_dims[0];
+  int batch_size = x.dims()[1];
+  int input_size = x.dims()[2];
+
+  size_t workspace_size;
+  size_t reserve_size;
+
+  ScopedRNNBase rnn(seq_length,
+                    batch_size,
+                    input_size,
+                    hidden_size,
+                    num_layers,
+                    dropout_prob,
+                    seed,
+                    weight_numel,
+                    true,
+                    is_bidirec);
+
+  rnn.Create<T>(handle,
+                dev_ctx.GetPlace(),
+                SequenceLength,
+                &workspace_size,
+                &reserve_size,
+                const_cast<phi::DenseTensor *>(&state_out));
+
+  phi::DenseTensor workspace_data_;
+  workspace_data_.Resize({static_cast<int64_t>(workspace_size)});
+  dev_ctx.template Alloc<uint8_t>(&workspace_data_);
+  const uint8_t *reserve_data = reserve.data<uint8_t>();
+
+#if CUDNN_VERSION >= 90000
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardData_v8(
+      handle,
+      rnn.rnn_desc(),
+      nullptr,
+      rnn.y_seq_desc(),
+      out_data,
+      out_grad_data,
+      rnn.x_seq_desc(),
+      in_grad_data,
+      rnn.init_h_desc(),
+      init_h_data,
+      last_h_grad_data,
+      init_h_grad_data,
+      rnn.init_c_desc(),
+      init_c_data,
+      last_c_grad_data,
+      init_c_grad_data,
+      rnn.weights_size(),
+      weight_data,
+      workspace_size,
+      workspace_data_.data<uint8_t>(),
+      reserve_size,
+      const_cast<uint8_t *>(reserve_data)));
+
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights_v8(
+      handle,
+      rnn.rnn_desc(),
+      CUDNN_WGRAD_MODE_ADD,
+      nullptr,
+      rnn.x_seq_desc(),
+      x.data<T>(),
+      rnn.init_h_desc(),
+      init_h.data<T>(),
+      rnn.y_seq_desc(),
+      out.data<T>(),
+      rnn.weights_size(),
+      weight_grad_data,
+      workspace_size,
+      workspace_data_.data<uint8_t>(),
+      reserve_size,
+      const_cast<uint8_t *>(reserve_data)));
+#else
+
+  if (!has_seq_length) {
+// This interface is used when the input/output is unpadded.
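+// The HIP branch below goes through MIOpen's RNN backward API, the CUDA
+// branch through the legacy cuDNN v7 API; both take per-timestep descriptor
+// arrays (x_descs/y_descs) built for the full seq_length.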
+#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenRNNBackwardData(handle, + rnn.rnn_desc(), + seq_length, + rnn.y_descs(), + out_data, + rnn.y_descs(), + out_grad_data, + rnn.last_h_desc(), + last_h_grad_data, + rnn.last_c_desc(), + last_c_grad_data, + rnn.weight_desc(), + weight_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.x_descs(), + in_grad_data, + rnn.init_h_desc(), + init_h_grad_data, + rnn.init_c_desc(), + init_c_grad_data, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNBackwardWeights( + handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_descs(), + out.data(), + rnn.weight_desc(), + weight_grad_data, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNBackwardData(handle, + rnn.rnn_desc(), + seq_length, + rnn.y_descs(), + out_data, + rnn.y_descs(), + out_grad_data, + rnn.last_h_desc(), + last_h_grad_data, + rnn.last_c_desc(), + last_c_grad_data, + rnn.weight_desc(), + weight_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.x_descs(), + in_grad_data, + rnn.init_h_desc(), + init_h_grad_data, + rnn.init_c_desc(), + init_c_grad_data, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights( + handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_descs(), + out.data(), + workspace_data_.data(), + workspace_size, + rnn.weight_desc(), + weight_grad_data, + const_cast(reserve_data), + reserve_size)); +#endif + } else { +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 + // for train + // This interface is used when the input/output is padded. 
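+    // Padded path: SequenceLength is set, so the *Ex variants that accept
+    // variable-length (padded) batches are required here.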
+ PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardDataEx( + handle, + rnn.rnn_desc(), + rnn.y_seq_desc(), + out_data, + rnn.y_seq_desc(), + out_grad_data, + nullptr, + nullptr, + rnn.last_h_desc(), + last_h_grad_data, + rnn.last_c_desc(), + last_c_grad_data, + rnn.weight_desc(), + weight_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.x_seq_desc(), + in_grad_data, + rnn.init_h_desc(), + init_h_grad_data, + rnn.init_c_desc(), + init_c_grad_data, + nullptr, + nullptr, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeightsEx( + handle, + rnn.rnn_desc(), + rnn.x_seq_desc(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_seq_desc(), + out.data(), + workspace_data_.data(), + workspace_size, + rnn.weight_desc(), + weight_grad_data, + const_cast(reserve_data), + reserve_size)); +#else + PADDLE_THROW(common::errors::Unavailable( + "The padded input of rnn is supported by cudnnRNNBackwardDataEx, " + "cudnnRNNBackwardWeightsEx, but it only works when the version " + "of cudnn is larger than 7.2.1")); +#endif + } + +#endif // end CUDNN_VERSION >= 90000 +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL( + cudnn_lstm_grad, GPU, ALL_LAYOUT, phi::CudnnLSTMGradKernel, float) {} +#else +PD_REGISTER_PLUGIN_KERNEL(cudnn_lstm_grad, + metax_gpu, + ALL_LAYOUT, + phi::CudnnLSTMGradKernel, + float, + double) {} +#endif diff --git a/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu new file mode 100644 index 00000000000..6bb94c9281a --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu @@ -0,0 +1,428 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
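+//
+// cuDNN/MIOpen LSTM forward kernel for the metax_gpu plugin. On cuDNN >= 9.0
+// it uses the fused cudnnRNNForward entry point; otherwise it falls back to
+// the legacy train/inference paths for padded and unpadded input.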
+ +#include "glog/logging.h" +#include "kernels/metax_kernel/metax_context.h" //NOLINT +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cudnn_lstm_kernel.h" +#include "paddle/phi/kernels/gpu/cudnn_lstm_utils.h" + +namespace phi { + +template +#ifdef PADDLE_WITH_HIP +void LSTMInference(const bool &has_seq_length, + const miopenHandle_t &handle, +#else +void LSTMInference(const bool &has_seq_length, + const cudnnHandle_t &handle, +#endif + const int &seq_length, + ScopedRNNBase *rnn, + const T *x_data, + const T *init_h_data, + const T *init_c_data, + const T *w_data, + T *out_data, + T *last_h_data, + T *last_c_data, + phi::DenseTensor *workspace_data, + const size_t &workspace_size) { +#if CUDNN_VERSION >= 90000 + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForward(handle, + rnn->rnn_desc(), + CUDNN_FWD_MODE_INFERENCE, + nullptr, + rnn->x_seq_desc(), + x_data, + rnn->y_seq_desc(), + out_data, + rnn->init_h_desc(), + init_h_data, + last_h_data, + rnn->init_c_desc(), + init_c_data, + last_c_data, + rnn->weights_size(), + w_data, + workspace_size, + workspace_data->data(), + 0, + nullptr)); + +#else + + if (!has_seq_length) { +// for inference +// This interface is used when the input/output is unpadded. +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenRNNForwardInference(handle, + rnn->rnn_desc(), + seq_length, + rnn->x_descs(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_descs(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + workspace_data->data(), + workspace_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForwardInference(handle, + rnn->rnn_desc(), + seq_length, + rnn->x_descs(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_descs(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + workspace_data->data(), + workspace_size)); +#endif + } else { +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 + // for inference + // This interface is used when the input/output is padded. 
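+    // Padded inference path: cudnnRNNForwardInferenceEx accepts padded
+    // variable-length batches; the unused optional arguments are nullptr.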
+ PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardInferenceEx( + handle, + rnn->rnn_desc(), + rnn->x_seq_desc(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_seq_desc(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + workspace_data->data(), + workspace_size)); +#else + // CUDNN VERSION has to >=7.2.1 + PADDLE_THROW(common::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardInferenceEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); +#endif + } + +#endif // end CUDNN_VERSION >= 90000 +} + +template +void CudnnLSTMKernel( + const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &init_h, + const DenseTensor &init_c, + const paddle::optional &w, + const paddle::optional> &weight_list, + const paddle::optional &sequence_length, + float dropout_prob, + bool is_bidirec, + int hidden_size, + int num_layers, + bool is_test, + int seed, + DenseTensor *out, + DenseTensor *last_h, + DenseTensor *last_c, + DenseTensor *reserve, + DenseTensor *state_out) { + const T *x_data = x.data(); + const T *init_h_data = init_h.data(); + const T *init_c_data = init_c.data(); + + T *out_data = dev_ctx.template Alloc(out); + T *last_h_data = dev_ctx.template Alloc(last_h); + T *last_c_data = dev_ctx.template Alloc(last_c); + + if (!is_test) { + if (seed == 0) { + // If not specify seed, use global Generator to generate seed. + int device_id = dev_ctx.GetPlace().GetDeviceId(); + auto gen_cuda = phi::DefaultCUDAGenerator(device_id); + seed = static_cast(gen_cuda->Random64()); + } + } + + auto *running_sequence_length = sequence_length.get_ptr(); + bool has_seq_length = running_sequence_length != nullptr; + std::vector SequenceLength; + if (has_seq_length) { + SequenceLength = phi::GetVectorFromTensor(running_sequence_length); + } + + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + + int seq_length = x.dims()[0]; + int batch_size = x.dims()[1]; + int input_size = x.dims()[2]; + bool state_initialized = state_out->initialized() ? true : false; + + size_t workspace_size; + size_t reserve_size; + phi::DenseTensor weight_whole; + T *w_data = nullptr; + int weight_numel; + bool w_initialized = false; + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + auto *running_w = w.get_ptr(); + if (is_test && running_w != nullptr) { + w_initialized = running_w->initialized() ? true : false; + weight_numel = running_w->numel(); + } + if (!w_initialized) { + auto running_weight_list = *weight_list.get_ptr(); + bool continuous = is_continuous>( + running_weight_list); + weight_numel = size_sum(running_weight_list); + + if (!continuous) { + LOG_FIRST_N(WARNING, 2) + << "If the memory space of the Input WeightList is not continuous, " + "less efficient calculation will be called. 
Please call " + "flatten_parameters() to make the input memory continuous."; + weight_whole.Resize({weight_numel}); + dev_ctx.template Alloc(&weight_whole); + weight_to_tensor(place, stream, running_weight_list, &weight_whole); + w_data = weight_whole.data(); + if (is_test) { // maybe also reset small weights' ptr for training + int offset = 0; + for (size_t i = 0; i < running_weight_list.size(); ++i) { + size_t len = running_weight_list[i]->numel(); + auto dim = running_weight_list[i]->dims(); + const_cast(running_weight_list[i]) + ->ShareDataWith( + weight_whole.Slice(static_cast(offset), + static_cast(offset + len))) + .Resize(dim); + offset += len; + } + } + } else { + w_data = const_cast(running_weight_list[0]->data()); + } + } else { + w_data = const_cast(running_w->data()); + } + + ScopedRNNBase rnn(seq_length, + batch_size, + input_size, + hidden_size, + num_layers, + dropout_prob, + seed, + weight_numel, + state_initialized, + is_bidirec); + rnn.Create(handle, + dev_ctx.GetPlace(), + SequenceLength, + &workspace_size, + &reserve_size, + state_out); + + phi::DenseTensor workspace_data_; + workspace_data_.Resize({static_cast(workspace_size)}); + dev_ctx.template Alloc(&workspace_data_); + + reserve->Resize({static_cast(reserve_size)}); + auto *reserve_data = dev_ctx.template Alloc(reserve); + + if (is_test) { + LSTMInference(has_seq_length, + handle, + seq_length, + &rnn, + x_data, + init_h_data, + init_c_data, + w_data, + out_data, + last_h_data, + last_c_data, + &workspace_data_, + workspace_size); + } else { +#if CUDNN_VERSION >= 90000 + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForward(handle, + rnn.rnn_desc(), + CUDNN_FWD_MODE_TRAINING, + nullptr, + rnn.x_seq_desc(), + x_data, + rnn.y_seq_desc(), + out_data, + rnn.init_h_desc(), + init_h_data, + last_h_data, + rnn.init_c_desc(), + init_c_data, + last_c_data, + rnn.weights_size(), + w_data, + workspace_size, + workspace_data_.data(), + reserve_size, + reserve_data)); +#else + + if (!has_seq_length) { +// for train +// This interface is used when the input/output is unpadded. +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNForwardTraining( + handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.weight_desc(), + w_data, + rnn.y_descs(), + out_data, + rnn.last_h_desc(), + last_h_data, + rnn.last_c_desc(), + last_c_data, + workspace_data_.data(), + workspace_size, + reserve_data, + reserve_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForwardTraining(handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.weight_desc(), + w_data, + rnn.y_descs(), + out_data, + rnn.last_h_desc(), + last_h_data, + rnn.last_c_desc(), + last_c_data, + workspace_data_.data(), + workspace_size, + reserve_data, + reserve_size)); +#endif + } else { +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 + // for train + // This interface is used when the input/output is padded. 
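+      // Padded training path: unlike inference, training must also fill the
+      // reserve buffer (reserve_data/reserve_size) consumed later by backward.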
+ PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardTrainingEx( + handle, + rnn.rnn_desc(), + rnn.x_seq_desc(), + x_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.weight_desc(), + w_data, + rnn.y_seq_desc(), + out_data, + rnn.last_h_desc(), + last_h_data, + rnn.last_c_desc(), + last_c_data, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + workspace_data_.data(), + workspace_size, + reserve_data, + reserve_size)); +#else + PADDLE_THROW(common::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardTrainingEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); +#endif + } +#endif // end CUDNN_VERSION >= 90000 + } +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(cudnn_lstm, GPU, ALL_LAYOUT, phi::CudnnLSTMKernel, float) { + kernel->InputAt(5).SetDataType(phi::DataType::INT32); + kernel->OutputAt(3).SetDataType(phi::DataType::UINT8); + kernel->OutputAt(4).SetDataType(phi::DataType::UINT8); +} +#else +PD_REGISTER_PLUGIN_KERNEL( + cudnn_lstm, metax_gpu, ALL_LAYOUT, phi::CudnnLSTMKernel, float, double) { + kernel->InputAt(5).SetDataType(phi::DataType::INT32); + kernel->OutputAt(3).SetDataType(phi::DataType::UINT8); + kernel->OutputAt(4).SetDataType(phi::DataType::UINT8); +} +#endif diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt index b4f1afbe5b0..4e54e17b3ef 100644 --- a/backends/metax_gpu/tests/ignore.txt +++ b/backends/metax_gpu/tests/ignore.txt @@ -19,3 +19,7 @@ test_uniform_random_op test_c_embedding_op test_slice_op test_compare_op +test_conv3d_transpose_op +test_conv3d_layer +test_conv3d_transpose_part2_op +test_fused_conv2d_add_act_op From cccf6b7e68cbaedd28c666773020d094556ab251 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 14:12:32 +0800 Subject: [PATCH 56/58] [metax] rename yaml file (#77) * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file --------- --- .github/workflows/metax_work.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index aff530d475c..f14023848c6 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -1,4 +1,4 @@ -name: padlle metax gpu test +name: paddle metax gpu test on: workflow_dispatch: From 7a7a7a0590eb0b61be1bd7a911f37dfd521cc2ec Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 14:30:16 +0800 Subject: [PATCH 57/58] [metax] rm file (#78) * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file * [metax] rm file * [metax] rm file --------- --- .../cuda_kernels/gammaln_grad_kernel.cu | 28 ----- .../kernels/impl/gammaln_grad_kernel_impl.h | 112 ------------------ .../kernels/metax_kernel/rnn_kernel.cu.cc | 2 + 3 files changed, 2 insertions(+), 140 deletions(-) delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu delete mode 100644 backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h diff --git a/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu deleted file mode 100644 index c6bd53f007f..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 
2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "kernels/impl/gammaln_grad_kernel_impl.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gammaln_grad_kernel.h" - -PD_REGISTER_PLUGIN_KERNEL(gammaln_grad, - metax_gpu, - ALL_LAYOUT, - phi::GammalnGradKernel, - float, - double, - phi::float16, - phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h deleted file mode 100644 index 2b222ba3b2c..00000000000 --- a/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/kernels/funcs/for_range.h" - -namespace phi { -template -HOSTDEVICE T digamma_positive_domain(T x) { - constexpr T c = T{8.5}; - constexpr T euler_mascheroni = T{0.57721566490153286060}; - T r; - T value; - T x2; - - if (x <= T{0.000001}) { - value = -euler_mascheroni - T{1.0} / x + T{1.6449340668482264365} * x; - return value; - } - - value = T{0.0}; - x2 = x; - while (x2 < c) { - value = value - T{1.0} / x2; // NOLINT - x2 = x2 + T{1.0}; - } - - r = T{1.0} / x2; - value = value + std::log(x2) - T{0.5} * r; - - r = r * r; - - value = value - - r * (T{1.0} / T{12.0} - - r * (T{1.0} / T{120.0} - - r * (T{1.0} / T{252.0} - - r * (T{1.0} / T{240.0} - r * (T{1.0} / T{132.0}))))); - - return value; -} - -template -HOSTDEVICE T digamma(T x) { - const static T pi = T{3.14159265358979323846}; // NOLINT - - if (x == T{0.0}) { - T inf = std::numeric_limits::infinity(); - return std::signbit(x) ? 
inf : -inf; - } else if (x < T{0.0}) { - if (x == std::trunc(x)) { - return std::numeric_limits::quiet_NaN(); - } else { - T iptr; - T frac_part = std::modf(x, &iptr); - return digamma_positive_domain(T{1.0} - x) - - pi / std::tan(pi * frac_part); - } - } else { - return digamma_positive_domain(x); - } -} - -template -struct GammalnGradFunctor { - GammalnGradFunctor(const T* dout, const T* x, T* output, int64_t numel) - : dout_(dout), x_(x), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - using MT = typename phi::dtype::MPTypeTrait::Type; - const MT mp_dout = static_cast(dout_[idx]); - const MT mp_x = static_cast(x_[idx]); - output_[idx] = static_cast(mp_dout * digamma(mp_x)); - } - - private: - const T* dout_; - const T* x_; - T* output_; - int64_t numel_; -}; -template -void GammalnGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& d_out, - DenseTensor* d_x) { - auto numel = d_out.numel(); - if (d_x && d_x->numel() == 0) { - dev_ctx.template Alloc(d_x); - return; - } - auto* dout_data = d_out.data(); - auto* x_data = x.data(); - auto* dx_data = - dev_ctx.template Alloc(d_x, static_cast(numel * sizeof(T))); - phi::funcs::ForRange for_range(dev_ctx, numel); - GammalnGradFunctor functor(dout_data, x_data, dx_data, numel); - for_range(functor); -} -} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc index 2598ce093e6..fa2c9e6e8b7 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc @@ -181,6 +181,7 @@ void RnnKernel(const Context &dev_ctx, else if (mode == "RNN_TANH") rnn_mode = miopenRNNTANH; #else + VLOG(0) << "Leave lstmKernel.11"; gpuRNNMode_t rnn_mode = CUDNN_LSTM; if (mode == "LSTM") rnn_mode = CUDNN_LSTM; @@ -228,6 +229,7 @@ void RnnKernel(const Context &dev_ctx, common::errors::InvalidArgument( "ROCm do not support SequenceLength yet.")); #endif + VLOG(0) << "Leave lstmKernel.12"; std::vector SequenceLength; if (has_seq_length) { SequenceLength = phi::GetVectorFromTensor(sequence_length.get_ptr()); From 5a76d35b53e1f7d970d6b388969ba56ae955dc0d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 17:18:00 +0800 Subject: [PATCH 58/58] metax_fix_ci (#79) * [metax] add Rules --------- --- .github/workflows/metax_work.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index f14023848c6..f73442b6fd5 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -7,6 +7,7 @@ on: branches: [develop, release/**] paths: - "**" + - "Paddle/**" - "!backends/**" - "backends/metax_gpu/**"