algorithm1832
diff --git a/‎Paddle‎ b/‎Paddle‎
diff --git a/‎backends/iluvatar_gpu/CMakeLists.txt‎
Lines changed: 3 additions & 2 deletions b/‎backends/iluvatar_gpu/CMakeLists.txt‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎backends/iluvatar_gpu/common/cuda_flags.cc‎
Lines changed: 16 additions & 0 deletions b/‎backends/iluvatar_gpu/common/cuda_flags.cc‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎backends/iluvatar_gpu/kernels/cuda_kernels/batched_gemm_kernel_register.cu‎
Lines changed: 23 additions & 0 deletions b/‎backends/iluvatar_gpu/kernels/cuda_kernels/batched_gemm_kernel_register.cu‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel.cu‎
Lines changed: 0 additions & 200 deletions b/‎backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel.cu‎
Lines changed: 0 additions & 200 deletions
diff --git a/‎backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu‎
Lines changed: 24 additions & 0 deletions b/‎backends/iluvatar_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu‎
Lines changed: 24 additions & 0 deletions
@@ -112,7 +112,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cuda_driver.cc
   # Core
   ${PADDLE_SOURCE_DIR}/paddle/phi/core/enforce.cc
-  ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
+  # ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/core/mixed_vector.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cusparse.cc
   # kernels/funcs
@@ -128,6 +128,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_grad_kernel.cu
+  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/batched_gemm.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/yolo_box_post_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_grad_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_kernel.cu
@@ -876,7 +877,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_stack_transpose_quant_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_swiglu_weighted_bwd_kernel.cu
-  ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
+  # ${PADDLE_SOURCE_DIR}/paddle/phi/core/flags.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math_function.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu
 
@@ -277,3 +277,19 @@ PHI_DEFINE_EXPORTED_bool(
     flash_attn_available,
     true,
     "Weather flash attention is available on the current device.");
+
+/**
+ * CUDNN related FLAG
+ * Name: FLAGS_conv_workspace_size_limit
+ * Since Version: 0.13.0
+ * Value Range: int64, default=1024 (MB)
+ * Example:
+ * Note: The internal function of cuDNN obtains the fastest matching algorithm
+ *       within this memory limit. Usually, faster algorithms can be chosen in
+ *       larger workspaces, but memory usage can also increase
+ *       significantly.
+ *       Users need to balance memory and speed.
+ */
+PHI_DEFINE_EXPORTED_int64(conv_workspace_size_limit,
+                          1024,
+                          "cuDNN convolution workspace limit in MB unit.");
@@ -0,0 +1,23 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/legacy/gpu/batched_gemm.h"
+
+PD_CUSTOM_KERNEL_REGISTER(batched_gemm,      // presumably the kernel name
+                          iluvatar_gpu,      // presumably the target backend
+                          ALL_LAYOUT,        // presumably the layout selector
+                          phi::BatchedGEMM,  // presumably from batched_gemm.h
+                          float,             // listed data types: float and
+                          phi::bfloat16) {}  // bfloat16; trailing body is empty
@@ -0,0 +1,24 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu"  //NOLINT
+
+PD_CUSTOM_KERNEL_REGISTER(fused_rotary_position_embedding_grad,
+                          iluvatar_gpu,  // presumably the target backend
+                          ALL_LAYOUT,    // presumably the layout selector
+                          phi::fusion::FusedRopeGradKernel,
+                          float,  // listed data types: float, float16,
+                          phi::dtype::float16,  // and bfloat16
+                          phi::dtype::bfloat16) {}  // empty body, no stray ';'