Skip to content

Commit 6730b0e

Browse files
YqGe585 and duqimeng authored
Fix fused rope kernel and update paddle (PaddlePaddle#2268)
* fix fused rope kernel and update paddle * fix flag * [Metax] Fix fused compile bug and remove some patch (PaddlePaddle#198) * fix flags * update disabled tests --------- Co-authored-by: duqimeng <[email protected]>
1 parent 8e2c4f8 commit 6730b0e

File tree

13 files changed

+101
-942
lines changed

13 files changed

+101
-942
lines changed

Paddle

Submodule Paddle updated 43 files

backends/iluvatar_gpu/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ file(
112112
${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cuda_driver.cc
113113
# Core
114114
${PADDLE_SOURCE_DIR}/paddle/phi/core/enforce.cc
115-
${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
115+
# ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
116116
${PADDLE_SOURCE_DIR}/paddle/phi/core/mixed_vector.cc
117117
${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cusparse.cc
118118
# kernels/funcs
@@ -128,6 +128,7 @@ file(
128128
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu
129129
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_kernel.cu
130130
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_grad_kernel.cu
131+
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/batched_gemm.cu
131132
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/yolo_box_post_kernel.cu
132133
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_grad_kernel.cu
133134
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_kernel.cu
@@ -876,7 +877,7 @@ file(
876877
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_stack_transpose_quant_kernel.cu
877878
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu
878879
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_swiglu_weighted_bwd_kernel.cu
879-
${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
880+
# ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
880881
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math_function.cc
881882
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_kernel.cu
882883
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu

backends/iluvatar_gpu/common/cuda_flags.cc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,3 +277,19 @@ PHI_DEFINE_EXPORTED_bool(
277277
flash_attn_available,
278278
true,
279279
"Weather flash attention is available on the current device.");
280+
281+
/**
282+
* CUDNN related FLAG
283+
* Name: FLAGS_conv_workspace_size_limit
284+
* Since Version: 0.13.0
285+
 * Value Range: int64, default=1024 (MB)
286+
* Example:
287+
* Note: The internal function of cuDNN obtains the fastest matching algorithm
288+
* within this memory limit. Usually, faster algorithms can be chosen in
289+
* larger workspaces, but memory space can also be significantly
290+
* increased.
291+
* Users need to balance memory and speed.
292+
*/
293+
PHI_DEFINE_EXPORTED_int64(conv_workspace_size_limit,
294+
1024,
295+
"cuDNN convolution workspace limit in MB unit.");
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/phi/core/kernel_registry.h"
16+
#include "paddle/phi/kernels/legacy/gpu/batched_gemm.h"
17+
18+
PD_CUSTOM_KERNEL_REGISTER(batched_gemm,
19+
iluvatar_gpu,
20+
ALL_LAYOUT,
21+
phi::BatchedGEMM,
22+
float,
23+
phi::bfloat16) {}

backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel.cu

Lines changed: 0 additions & 200 deletions
This file was deleted.
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/phi/core/kernel_registry.h"
16+
#include "paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu" //NOLINT
17+
18+
PD_CUSTOM_KERNEL_REGISTER(fused_rotary_position_embedding_grad,
19+
iluvatar_gpu,
20+
ALL_LAYOUT,
21+
phi::fusion::FusedRopeGradKernel,
22+
float,
23+
phi::dtype::float16,
24+
phi::dtype::bfloat16){};

0 commit comments

Comments
 (0)