
Commit 01c26ab

fix fc kernel diff (#49781)
* fix fc kernel diff
* disable fc_elementwise_layernorm_fuse_pass
1 parent 8a93404 · commit 01c26ab


3 files changed: +11 -16 lines changed


paddle/fluid/inference/api/paddle_pass_builder.cc

Lines changed: 2 additions & 1 deletion
```diff
@@ -171,8 +171,9 @@ const std::vector<std::string> kGpuLowerPrecisionPasses{
     "multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass",
     "gpu_cpu_map_matmul_v2_to_mul_pass",
     "gpu_cpu_map_matmul_v2_to_matmul_pass",
+    "gpu_cpu_map_matmul_to_mul_pass",
     "fc_fuse_pass",
-    "fc_elementwise_layernorm_fuse_pass",
+    // "fc_elementwise_layernorm_fuse_pass",
     "embedding_eltwise_layernorm_fuse_pass",
     "runtime_context_cache_pass",
 };
```
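Commenting the entry out (rather than deleting it) drops `fc_elementwise_layernorm_fuse_pass` from the GPU low-precision pipeline while keeping its former position visible in the list. As a minimal sketch, assuming the public `AnalysisConfig`/`PaddlePassBuilder` inference API, a caller could still opt back into the fusion; the wrapper function here is hypothetical:

```cpp
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Hypothetical helper: pass_builder() and AppendPass() come from the public
// inference API. Re-enabling the pass is at the caller's own risk while the
// fused kernel and the unfused FC path can produce different results.
void ReEnableFcLayerNormFusion(paddle::AnalysisConfig* config) {
  config->pass_builder()->AppendPass("fc_elementwise_layernorm_fuse_pass");
}
```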

paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu

Lines changed: 6 additions & 9 deletions
```diff
@@ -276,9 +276,9 @@ __global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data,
       half tmp_0 = __hdiv(__hsub(save_ptr[save_index], mean_i), std_i);
       half tmp_1 = scale ? __hmul(scale[j], tmp_0) : tmp_0;
 #else
-      half tmp_0 = static_cast<half>(static_cast<float>(save_ptr[save_index]) -
-                                     static_cast<float>(mean_i) /
-                                     static_cast<float>(std_i));
+      half tmp_0 = static_cast<half>((static_cast<float>(save_ptr[save_index]) -
+                                      static_cast<float>(mean_i)) /
+                                     static_cast<float>(std_i));
       half tmp_1 = scale ? static_cast<half>(static_cast<float>(scale[j]) *
                                              static_cast<float>(tmp_0))
                          : tmp_0;
```
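The `#else` branch (taken when native fp16 intrinsics are unavailable) previously parsed as `x - (mean / std)` because division binds tighter than subtraction; the fix parenthesizes the subtraction so the fallback matches the `__hdiv(__hsub(...))` path above it. A standalone sketch of the difference, independent of the Paddle sources:

```cpp
#include <cstdio>

int main() {
  float x = 10.0f, mean = 4.0f, std_dev = 2.0f;
  // Without parentheses, division binds tighter than subtraction:
  float buggy = x - mean / std_dev;    // parsed as x - (mean / std_dev) == 8
  // The intended LayerNorm-style normalization:
  float fixed = (x - mean) / std_dev;  // == 3
  std::printf("buggy=%g fixed=%g\n", buggy, fixed);
  return 0;
}
```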
```diff
@@ -394,19 +394,16 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
 
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
-    blas.GEMM(false,
-              false,
+    blas.GEMM(CblasNoTrans,
+              CblasNoTrans,
               M,
               N,
               K,
               static_cast<T>(1.0),
               x_data,
-              K,
               w_data,
-              N,
               static_cast<T>(0.0),
-              out_data,
-              N);
+              out_data);
     auto* y = ctx.Input<framework::Tensor>("Y");
     auto* bias_0 = ctx.Input<framework::Tensor>("Bias0");
     auto* bias_1 = ctx.Input<framework::Tensor>("Bias1");
```
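Both this file and `fc_functor.cu` below switch from the `bool` overload of `Blas::GEMM`, which takes explicit leading dimensions, to the `CBLAS_TRANSPOSE` overload, which derives them, so the fused op and the plain FC functor now go through the same GEMM path. A sketch in plain CBLAS (not Paddle's wrapper) of why dropping the `K`, `N`, `N` arguments is safe in the row-major, no-transpose case: they are exactly the leading dimensions that case implies.

```cpp
#include <cblas.h>

// Row-major FC forward: out[M x N] = x[M x K] * w[K x N].
// For CblasNoTrans/CblasNoTrans the leading dimensions are fixed to
// lda = K, ldb = N, ldc = N -- the very values the removed arguments
// spelled out, which is why an overload can fill them in by itself.
void fc_forward(const float* x, const float* w, float* out,
                int M, int N, int K) {
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
              M, N, K,
              /*alpha=*/1.0f, x, /*lda=*/K, w, /*ldb=*/N,
              /*beta=*/0.0f, out, /*ldc=*/N);
}
```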

paddle/phi/kernels/funcs/fc_functor.cu

Lines changed: 3 additions & 6 deletions
```diff
@@ -292,19 +292,16 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
                     errors::PermissionDenied(
                         "Weight padding in fc can not be used in GPU scope."));
   auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
-  blas.GEMM(false,
-            false,
+  blas.GEMM(CblasNoTrans,
+            CblasNoTrans,
             M,
             N,
             K,
             static_cast<T>(1.0),
             X,
-            K,
             W,
-            N,
             static_cast<T>(0.0),
-            Y,
-            N);
+            Y);
   if (B == NULL) {
     return;
   }
```
