diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 2e0d021f102a55..bb60cca94f3d76 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -253,9 +253,9 @@ if(WITH_XPU_XRE5) DOWNLOAD_COMMAND bash ${CMAKE_SOURCE_DIR}/tools/xpu/pack_paddle_dependence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XHPC_URL} ${XPU_XHPC_DIR_NAME} - ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} 1 ${WITH_MKL} - "${CMAKE_SOURCE_DIR}/build" && wget ${XPU_XFT_GET_DEPENCE_URL} && bash - ${XFT_COMMAND} ${XPU_XFT_URL} ${XPU_XFT_DIR_NAME} && bash + ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} 1 ${WITH_MKL} "${CMAKE_BINARY_DIR}" + && wget ${XPU_XFT_GET_DEPENCE_URL} && bash ${XFT_COMMAND} ${XPU_XFT_URL} + ${XPU_XFT_DIR_NAME} && bash ${CMAKE_SOURCE_DIR}/tools/xpu/get_xpti_dependence.sh ${XPU_XPTI_URL} ${XPU_XPTI_DIR_NAME} && bash ${CMAKE_SOURCE_DIR}/tools/xpu/get_xpufft_dependence.sh ${XPU_FFT_URL} diff --git a/paddle/cinn/common/integer_set.cc b/paddle/cinn/common/integer_set.cc index a4fa9ecbae1afe..efe54a192a7d2a 100644 --- a/paddle/cinn/common/integer_set.cc +++ b/paddle/cinn/common/integer_set.cc @@ -164,7 +164,8 @@ cas_intervals_t CollectVarIntervalsOfExprs(const std::vector<Expr>& exprs, lower_bound = ir::Expr(1); } var_intervals.insert( - {var->name, CasInterval(lower_bound, upper_bound)}); + {var->name, + CasInterval(lower_bound, NormalizeUpperBound(upper_bound))}); } return false; }); @@ -572,6 +573,9 @@ class BoundReplacer : public ir::IRMutator<> { ir::Expr SymbolicExprAnalyzer::LowerBound(const ir::Expr& expr) const { BoundReplacer bound_replacer(var_intervals_, true); ir::Expr bound = ir::ir_utils::IRCopy(expr); + if (bound.is_index()) { + bound = bound.as_index().Normalize(ir::IndexExpr::OptLevel::kLevel3); + } bound_replacer(&bound); return optim::ArithSimplify(bound); } @@ -579,7 +583,11 @@ ir::Expr SymbolicExprAnalyzer::LowerBound(const ir::Expr& expr) const { ir::Expr SymbolicExprAnalyzer::UpperBound(const ir::Expr& expr) const { BoundReplacer bound_replacer(var_intervals_, false); ir::Expr bound = ir::ir_utils::IRCopy(expr); + if (bound.is_index()) { + bound = bound.as_index().Normalize(ir::IndexExpr::OptLevel::kLevel3); + } bound_replacer(&bound); + return optim::ArithSimplify(bound); } @@ -709,7 +717,8 @@ SingleIntervalIntSet::SingleIntervalIntSet(const ir::Expr& min, ?
x->as_var()->upper_bound : SymbolicExprLimit::positive_inf; var_intervals_.insert( - {x->as_var()->name, CasInterval(lower_bound, upper_bound)}); + {x->as_var()->name, + CasInterval(lower_bound, NormalizeUpperBound(upper_bound))}); } return false; }; diff --git a/paddle/cinn/common/ir_util.cc b/paddle/cinn/common/ir_util.cc index 32f967f09d7759..710d6a2c85f2a1 100644 --- a/paddle/cinn/common/ir_util.cc +++ b/paddle/cinn/common/ir_util.cc @@ -270,6 +270,16 @@ bool is_zero(Expr v) { return false; } +Expr NormalizeUpperBound(Expr upper_bound, bool minus_one /* = true */) { + if (upper_bound == SymbolicExprLimit::positive_inf) { + return upper_bound; + } + if (minus_one) { + return upper_bound - ir::Expr(1); // [lower, upper) to [lower, upper] + } + return upper_bound + ir::Expr(1); // [lower, upper] to [lower, upper) +} + Expr CastIfNeeded(Expr body, Type type) { if (body.type() == type) return body; return ir::Cast::Make(type, body); diff --git a/paddle/cinn/common/ir_util.h b/paddle/cinn/common/ir_util.h index bbc81c2b64e5d3..d4486a052b9e70 100644 --- a/paddle/cinn/common/ir_util.h +++ b/paddle/cinn/common/ir_util.h @@ -91,6 +91,8 @@ std::vector GatherItersToTensorProducer( bool is_zero(Expr v); +Expr NormalizeUpperBound(Expr upper_bound, bool minus_one = true); + bool MathEqual(const Expr &a, const Expr &b); //! helper function to get a ir::Select node.
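The NormalizeUpperBound helper above bridges two bound conventions: a Var's upper_bound is exclusive (i ∈ [lower, upper)), while CasInterval after this change holds a closed interval [lower, upper]. A minimal standalone sketch of the conversion arithmetic, with plain ints standing in for ir::Expr (illustrative names only, not the CINN API):

```cpp
#include <cassert>

// Var upper_bound is exclusive; CasInterval's upper bound is inclusive.
int ToInclusiveUpper(int exclusive_upper) { return exclusive_upper - 1; }
int ToExclusiveUpper(int inclusive_upper) { return inclusive_upper + 1; }

int main() {
  // A loop `for (i = 0; i < 8; ++i)` gives i in [0, 8).
  int extent = 8;
  int inclusive = ToInclusiveUpper(extent);
  assert(inclusive == 7);                         // largest value i can take
  assert(ToExclusiveUpper(inclusive) == extent);  // round-trips
  return 0;
}
```

This is also why BoundSimplify (further down) now uses ProveLT where it previously used ProveLE: with an inclusive upper bound u, proving `numerator / d == 0` needs u < d rather than u <= d.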
diff --git a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc index 6cbdfef7b11333..d59d77954934ce 100644 --- a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc +++ b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc @@ -136,7 +136,7 @@ std::vector<std::pair<ir::Expr, ir::Expr>> DynamicShapeGroupScheduler::GetCX86IRs() { std::vector<std::pair<ir::Expr, ir::Expr>> irs(1); irs[0].first = ir::EQ::Make(ir::Expr(1), ir::Expr(1)); - irs[1].second = ir_sch_->GetModule().GetExprs()[0]; + irs[0].second = ir_sch_->GetModule().GetExprs()[0]; return irs; } diff --git a/paddle/cinn/ir/group_schedule/tactic/arrange_storage_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/arrange_storage_tactic.cc index 333846d6740568..2327d2f3aeeddd 100644 --- a/paddle/cinn/ir/group_schedule/tactic/arrange_storage_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/arrange_storage_tactic.cc @@ -141,7 +141,8 @@ IntSet Evaluate(Expr expr, const std::unordered_map& var_domain) { Expr copy_for_upper_bound = ir::ir_utils::IRCopy(expr); Expr copy_for_lower_bound = ir::ir_utils::IRCopy(expr); - common::cas_intervals_t var_intervals; + common::cas_intervals_t + var_intervals; // variable name -> CasInterval [lower_bound, upper_bound] std::vector<ir::Expr> var_vec = ir::ir_utils::CollectIRNodesWithoutTensor( expr, [](const ir::Expr* x) { return x->as_var(); }); for (Expr var_expr : var_vec) { @@ -150,7 +151,9 @@ IntSet Evaluate(Expr expr, const ir::Var& fixed_var = fixed.at(var); var_intervals.emplace( fixed_var->name, - common::CasInterval(fixed_var->lower_bound, fixed_var->upper_bound)); + common::CasInterval( + fixed_var->lower_bound, + cinn::common::NormalizeUpperBound(fixed_var->upper_bound))); optim::ReplaceVarWithExpr(&copy_for_lower_bound, var, Expr(fixed_var)); optim::ReplaceVarWithExpr(&copy_for_upper_bound, var, Expr(fixed_var)); } else if (var_domain.count(var) != 0) { @@ -172,7 +175,8 @@ IntSet Evaluate(Expr expr, ::common::errors::InvalidArgument( "The 'upper_bound' of the variable must be defined.")); optim::ReplaceVarWithExpr(&copy_for_lower_bound, var, var->lower_bound); - optim::ReplaceVarWithExpr(&copy_for_upper_bound, var, var->upper_bound); + optim::ReplaceVarWithExpr( + &copy_for_upper_bound, var, NormalizeUpperBound(var->upper_bound)); } } ir::Expr lower_bound = optim::ArithSimplify(copy_for_lower_bound); diff --git a/paddle/cinn/ir/ir.h b/paddle/cinn/ir/ir.h index 7acf4e110cde2d..cf8b58cd6b57f7 100644 --- a/paddle/cinn/ir/ir.h +++ b/paddle/cinn/ir/ir.h @@ -421,6 +421,7 @@ struct _Var_ : public ExprNode<_Var_> { }; //! A named variable. +// i ∈ [lower_bound, upper_bound) struct Var : public IrNodeRef { Var() = default; explicit Var(IrNode* n) : IrNodeRef(n) {} @@ -846,6 +847,7 @@ struct For : public ExprNode<For>, public ForBase { //! The minimum value of the iteration. Expr min; //! The extent of the iteration. + // loop_var ∈ [min, min + extent) Expr extent; Expr body; diff --git a/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc b/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc index 492738516e95a7..860d285b242aa6 100644 --- a/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc +++ b/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc @@ -621,7 +621,8 @@ std::vector<ir::Var> IndicesToVars(const std::vector<ir::Expr>& indices) { if (e.is_constant()) { std::string var_name = cinn::UniqName("constant" + static_cast<int>(e.get_constant())); - result.emplace_back(e, e, var_name, /* is_reduce = */ false); + result.emplace_back( + e, NormalizeUpperBound(e, false), var_name, /* is_reduce = */ false); } else if (e.As<ir::_Var_>() != nullptr) { ir::Expr copy_e = ir::ir_utils::IRCopy(e); ir::_Var_* var_ref = copy_e.As<ir::_Var_>(); @@ -635,14 +636,17 @@ std::vector<ir::Var> IndicesToVars(const std::vector<ir::Expr>& indices) { ir::Var var = x->as_var_ref(); var_intervals.insert( {var->name, - common::CasInterval{var->lower_bound, var->upper_bound}}); + common::CasInterval{var->lower_bound, + NormalizeUpperBound(var->upper_bound)}}); if (var->is_reduce_axis) is_reduce = true; } return false; }); common::SymbolicExprAnalyzer analyzer(var_intervals); - result.emplace_back( - analyzer.LowerBound(e), analyzer.UpperBound(e), var_name, is_reduce); + result.emplace_back(analyzer.LowerBound(e), + NormalizeUpperBound(analyzer.UpperBound(e), false), + var_name, + is_reduce); } } return result; diff --git a/paddle/cinn/optim/ir_simplify.cc b/paddle/cinn/optim/ir_simplify.cc index f1b5d3dfc9f381..1457b61528976a 100644 --- a/paddle/cinn/optim/ir_simplify.cc +++ b/paddle/cinn/optim/ir_simplify.cc @@ -386,6 +386,296 @@ struct SimplifySelectMutator : public ir::IRMutator<> { } }; +/* +Example 1: + Select(a <= b, b, a) → max(a, b) +Example 2: + Select(a <= b, a, b) → min(a, b) +Example 3: + Select(a <= MAX, max(a, MIN), MAX) → min(max(a, MIN), MAX) + Select(a <= MAX, max(MIN, a), MAX) → min(max(a, MIN), MAX) +Example 4: + Select(MIN <= b, min(b, MAX), MIN) → max(min(b, MAX), MIN) + → min(max(b, MIN), MAX) + Select(MIN <= b, min(MAX, b), MIN) → max(min(b, MAX), MIN) + → min(max(b, MIN), MAX) +*/ +struct SimplifySelect2MinMaxMutator : public ir::ExprMutator<> { + void operator()(Expr* x) { ir::ExprMutator<>::Visit(x, x); } + + using ir::ExprMutator<>::Visit; + + // Recursively optimize CompareOp operands + template <typename T> + void VisitCompare(T* op, Expr* expr) { + Expr a = op->a(); + Expr b = op->b(); + ir::ExprMutator<>::Visit(&a, &a); + ir::ExprMutator<>::Visit(&b, &b); + + if (a.get() != op->a().get() || b.get() != op->b().get()) { + *expr = T::Make(a, b); + } + } + + void Visit(const ir::GE* op, Expr* expr) override { VisitCompare(op, expr); } + void Visit(const ir::GT* op, Expr* expr) override { VisitCompare(op, expr); } + void Visit(const ir::LE* op, Expr* expr) override { VisitCompare(op, expr); } + void Visit(const ir::LT* op, Expr* expr) override {
VisitCompare(op, expr); } + + void Visit(const Select* op, Expr* expr) override { + auto* node = expr->As<ir::Select>(); + + // 1. Recursively optimize sub-expressions + Expr condition = node->condition; + Expr true_value = node->true_value; + Expr false_value = node->false_value; + + ir::ExprMutator<>::Visit(&condition, &condition); + ir::ExprMutator<>::Visit(&true_value, &true_value); + ir::ExprMutator<>::Visit(&false_value, &false_value); + + // 2. If sub-expressions are modified, rebuild the Select node + if (condition.get() != node->condition.get() || + true_value.get() != node->true_value.get() || + false_value.get() != node->false_value.get()) { + *expr = ir::Select::Make(condition, true_value, false_value); + node = expr->As<ir::Select>(); + } + + // 3. Function to optimize Select into Min/Max when possible + auto TryOptimizeSelect = [&](const Expr& a, + const Expr& b, + const Expr& x, + const Expr& y) -> Expr { + // Case 1: Select(a <= b, b, a) → max(a, b) + if (x == b && y == a) { + if (b.is_constant()) { + return ir::Max::Make(a, b); + } else { + return ir::Max::Make(b, a); + } + } + // Case 2: Select(a <= b, a, b) → min(a, b) + if (x == a && y == b) { + if (b.is_constant()) { + return ir::Min::Make(a, b); + } else { + return ir::Min::Make(b, a); + } + } + // Case 3: Select(a <= MAX, max(a, MIN), MAX) → min(max(a, MIN), MAX) + if (auto* max = x.As<ir::Max>()) { + if (max->a() == a) { + if (max->b().is_constant() && y.is_constant() && b.is_constant()) { + if (y.get_constant() == b.get_constant() && + (max->b()).get_constant() <= y.get_constant()) { + return ir::Min::Make(ir::Max::Make(a, max->b()), b); + } + } + } else if (max->b() == a) { + // Select(a <= MAX, max(MIN, a), MAX) → min(max(a, MIN), MAX) + if (max->a().is_constant() && y.is_constant() && b.is_constant()) { + if (y.get_constant() == b.get_constant() && + (max->a()).get_constant() <= y.get_constant()) { + return ir::Min::Make(ir::Max::Make(a, max->a()), b); + } + } + } + } + // Case 4: Select(MIN <= b, min(b, MAX), MIN) → max(min(b, MAX), MIN) + // → min(max(b, MIN), MAX) + if (auto* min = x.As<ir::Min>()) { + if (min->a() == b) { + if ((min->b()).is_constant() && y.is_constant() && a.is_constant()) { + if (y.get_constant() == a.get_constant() && + y.get_constant() <= (min->b()).get_constant()) { + return ir::Min::Make(ir::Max::Make(b, a), min->b()); + } + } + } else if (min->b() == b) { + // Select(MIN <= b, min(MAX, b), MIN) → min(max(b, MIN), MAX) + if ((min->a()).is_constant() && y.is_constant() && a.is_constant()) { + if (y.get_constant() == a.get_constant() && + y.get_constant() <= (min->a()).get_constant()) { + return ir::Min::Make(ir::Max::Make(b, a), min->a()); + } + } + } + } + return Expr(nullptr); + }; + + // 4. Try to optimize different comparison conditions by converting them to + // <= logic + if (auto* ge = node->condition.As<ir::GE>()) { + // Select(a >= b, t, f) → Select(b <= a, t, f) + Expr optimized = TryOptimizeSelect( + ge->b(), ge->a(), node->true_value, node->false_value); + if (optimized.defined()) { + *expr = optimized; + return; + } + } else if (auto* gt = node->condition.As<ir::GT>()) { + // Select(a > b, t, f) → Select(a <= b, f, t) + Expr optimized = TryOptimizeSelect( + gt->a(), gt->b(), node->false_value, node->true_value); + if (optimized.defined()) { + *expr = optimized; + return; + } + } else if (auto* le = node->condition.As<ir::LE>()) { + // Select(a <= b, t, f) → Select(a <= b, t, f) + Expr optimized = TryOptimizeSelect( + le->a(), le->b(), node->true_value, node->false_value); + if (optimized.defined()) { + *expr = optimized; + return; + } + } else if (auto* lt = node->condition.As<ir::LT>()) { + // Select(a < b, t, f) → Select(b <= a, f, t) + Expr optimized = TryOptimizeSelect( + lt->b(), lt->a(), node->false_value, node->true_value); + if (optimized.defined()) { + *expr = optimized; + return; + } + } + } +};
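The rewrite cases handled by TryOptimizeSelect are plain order identities. A quick exhaustive check over small ints (stand-ins for CINN exprs, not the CINN API):

```cpp
#include <algorithm>
#include <cassert>

// Select(c, t, f) modeled as the ternary operator.
int Select(bool c, int t, int f) { return c ? t : f; }

int main() {
  const int kMin = 0, kMax = 255;  // requires MIN <= MAX, as the mutator checks
  for (int a = -300; a <= 300; ++a) {
    for (int b = -300; b <= 300; ++b) {
      assert(Select(a <= b, b, a) == std::max(a, b));  // Case 1
      assert(Select(a <= b, a, b) == std::min(a, b));  // Case 2
    }
    // Case 3, the clamp pattern:
    assert(Select(a <= kMax, std::max(a, kMin), kMax) ==
           std::min(std::max(a, kMin), kMax));
    // Case 4 plus its normalization max(min(b, MAX), MIN) == min(max(b, MIN), MAX):
    assert(Select(kMin <= a, std::min(a, kMax), kMin) ==
           std::min(std::max(a, kMin), kMax));
  }
  return 0;
}
```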
+ +// Optimizes pow(2.0f, ceil(log2(x))) pattern into more efficient bit +// manipulation: +// Original: pow(2.0f, ceil(log2(x))) +// Optimized: ldexpf(1.0f, exponent) where exponent is calculated via: +// 1. float_as_uint(x) - reinterpret float as uint32 +// 2. right_shift(bits, 23) - extract exponent field +// 3. ((bits >> 23) & 0xFF) - 127 - adjust IEEE754 bias +// 4. +1 if mantissa is non-zero (for ceil behavior) +struct SimplifyPowerCeilLog2BitOpLdexpfMutator : public ir::ExprMutator<> { + void operator()(Expr* expr) { ir::ExprMutator<>::Visit(expr, expr); } + + using ir::ExprMutator<>::Visit; + void Visit(const ir::Call* op, Expr* expr) override { + /// 1. First recursively process all sub-expressions + std::vector<Expr> new_args; + for (const auto& arg : op->read_args) { + Expr new_arg = arg; + Visit(&new_arg, &new_arg); + new_args.push_back(new_arg); + } + + // 2. Match target pattern: pow(base, ceil(log2(x))) + if (op->name == "pow" && new_args.size() == 2) { + const Expr& base = new_args[0]; + const Expr& exponent = new_args[1]; + + // Check if exponent is ceil(log2(x)) + if (const ir::Call* ceil_call = exponent.As<ir::Call>()) { + if (ceil_call->name == "ceil" && ceil_call->read_args.size() == 1) { + if (const ir::Call* log2_call = + ceil_call->read_args[0].As<ir::Call>()) { + if (log2_call->name == "log2" && log2_call->read_args.size() == 1 && + log2_call->read_args[0].type().is_float(32)) { + /// Verify base is 2.0f for optimization + bool is_base_two = false; + if (base.is_constant()) { + if (base.get_constant() == 2.0f) { + is_base_two = true; + } + } + if (is_base_two) { + // 3.
Replace with bit operations + ldexpf + Expr x = log2_call->read_args[0]; // Extract log2's argument + + // Create bit operations to compute ceil(log2(x)) + // (1) Reinterpret float as 32-bit integer + Expr bits = ir::Call::Make(common::Int(32), + "__float_as_uint", + {x}, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0, + {}); + + std::vector<ir::Expr> shift_r_args = {bits, ir::Expr(23)}; + Expr shift_r = ir::Call::Make(common::Int(32), + "right_shift", + shift_r_args, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0, + {}); + // (2) Extract exponent part: ((bits >> 23) & 0xFF) - 127 + std::vector<ir::Expr> bitwise_and_exp_args = { + shift_r, ir::Expr(0xFF)}; + Expr bitwise_and_exp = ir::Call::Make(common::Int(32), + "bitwise_and", + bitwise_and_exp_args, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0, + {}); + Expr exponent_raw = + ir::Sub::Make(bitwise_and_exp, ir::Expr(127)); + // (3) Check if mantissa is non-zero (i.e., if exponent+1 is + // needed) + std::vector<ir::Expr> bitwise_and_tail_args = { + bits, ir::Expr(0x007FFFFF)}; + Expr bitwise_and_tail = ir::Call::Make(common::Int(32), + "bitwise_and", + bitwise_and_tail_args, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0, + {}); + Expr mantissa_non_zero = + ir::NE::Make(bitwise_and_tail, ir::Expr(0)); + // (4) Check if it's a normal number (exponent != -127) + Expr is_normal = ir::NE::Make(exponent_raw, ir::Expr(-127)); + // (5) If needed, exponent += 1 + Expr exponent_final = ir::Add::Make( + exponent_raw, + ir::Select::Make( + ir::And::Make(is_normal, mantissa_non_zero), + ir::Expr(1), + ir::Expr(0))); + // (6) Create final expression: ldexpf(1.0f, exponent_final) + Expr new_expr = ir::Call::Make(op->type(), + "ldexpf", + {ir::Expr(1.0f), exponent_final}, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0, + {}); + *expr = new_expr; + return; + } + } + } + } + } + + // For non-target patterns, reconstruct as-is + if (new_args != op->read_args) { + *expr = ir::Call::Make(op->type(), + op->name, + new_args, + op->write_args, + op->call_type, + op->func, + op->value_index, + op->attrs); + } + } +}; + struct SimplifyUnitBlockMutator : public ir::ExprMutator<> { void operator()(Expr* x) { ir::ExprMutator::Visit(x, x); } @@ -498,6 +788,8 @@ void Simplify(Expr* expr) { SimplifyLogicalMutator()(expr); SimplifyIfThenElseMutator()(expr); SimplifySelectMutator()(expr); + SimplifySelect2MinMaxMutator()(expr); + SimplifyPowerCeilLog2BitOpLdexpfMutator()(expr); SimplifyNoPureMathMutator()(expr); VLOG(6) << "End Simplify " << *expr; }
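A CPU-side check of the bit formula the new mutator emits: extracting the IEEE-754 exponent field and bumping it when the mantissa is non-zero reproduces ceil(log2(x)) for normal positive floats. A hedged sketch in portable C++ (the generated code targets the extern calls above; memcpy stands in for __float_as_uint):

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// Mirrors the emitted sequence: __float_as_uint, >> 23, & 0xFF, - 127,
// +1 when the mantissa bits are non-zero, then ldexpf(1.0f, exponent).
float CeilPow2(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  int exponent = static_cast<int>((bits >> 23) & 0xFF) - 127;
  bool mantissa_non_zero = (bits & 0x007FFFFF) != 0;
  bool is_normal = exponent != -127;
  if (is_normal && mantissa_non_zero) exponent += 1;
  return std::ldexp(1.0f, exponent);
}

int main() {
  const float samples[] = {1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 5.5f, 1000.0f};
  for (float x : samples) {
    int expected = static_cast<int>(std::ceil(std::log2(x)));
    assert(CeilPow2(x) == std::ldexp(1.0f, expected));  // == pow(2, ceil(log2 x))
  }
  return 0;
}
```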
diff --git a/paddle/cinn/optim/simplify_util.cc b/paddle/cinn/optim/simplify_util.cc index 0c02ff5ce9bb89..5fa37a3ccc3d01 100644 --- a/paddle/cinn/optim/simplify_util.cc +++ b/paddle/cinn/optim/simplify_util.cc @@ -677,8 +677,124 @@ std::optional> MatchPattern( return std::nullopt; } +/*! + * \brief Optimize linear division and modulo operations with constant + * denominators. + * + * This function handles linear expressions of the form + * `(a * C1 + b) / C2` and `(a * C1 + b) % C2` + * where C1 and C2 are constants. It specifically targets: + * 1. Linear combinations in the numerator (sums of terms) + * 2. Constant denominators + * + * The optimization: + * 1. Separates terms divisible by the denominator (linear coefficients) + * 2. Groups remaining terms as a remainder expression + * 3. For division: + * - Returns the sum of divisible terms if remainder < denominator + * - Otherwise preserves the original division + * 4. For modulo: + * - Returns the remainder if it's provably smaller than denominator + * - Otherwise preserves the original modulo + * + * Example linear optimizations: + * 1. Linear division: (x * 8 + y * 4 + 3) / 4 → x*2 + y + 0 (when 3 < 4) + * 2. Linear modulo: (x * 8 + y * 4 + 3) % 4 → 0 + 0 + 3 + * 3. Partial division: (x * 6 + a + b) / 3 → x * 2 + (a + b) / 3 (when a < 3 and b < 3 but a + b may reach 3) + * + * \param expr The linear division/modulo expression to optimize + * \param ana Symbolic analyzer for proving expression bounds + * \return Simplified expression if provably correct, original otherwise + */ +ir::IndexExpr HandleDivModWithConstants( + const ir::IndexExpr &expr, const common::SymbolicExprAnalyzer &ana) { + // Get numerator and denominator + auto numerator = expr.operand(0); + auto denominator = expr.operand(1); + + // Check if denominator is a constant + if (!denominator.is_constant()) { + return expr; + } + int64_t denom_val = denominator.as_int64(); + + // Recursively expand addition chain and collect all terms + std::vector<ir::IndexExpr> terms = optim::GetFlattenExprs(numerator); + if (terms.empty()) { + return expr; + } + + // Separate terms that are multiples of denominator from other terms + std::vector<ir::IndexExpr> multiple_terms; + std::vector<ir::IndexExpr> remainder_terms; + + for (auto &term : terms) { + if (term.node_type() == ir::IrNodeTy::Mul) { + auto rhs = term.operand(1); + if (rhs.is_constant() && rhs.as_int64() % denom_val == 0) { + // Extract terms divisible by denominator + multiple_terms.push_back( + term.operand(0) * + (rhs.as_int64() / denom_val)); // Extract multiplicand part + continue; + } + } + // Extract terms not divisible by denominator + auto remainder_upper = ana.UpperBound(term); + if (!ana.ProveLT(remainder_upper, denominator).value_or(false)) { + return expr; + } + remainder_terms.push_back(term); + } + + // Build remainder expression + ir::IndexExpr remainder_expr; + if (remainder_terms.empty()) { + remainder_expr = ir::IndexExpr(0); + } else if (remainder_terms.size() == 1) { + remainder_expr = remainder_terms[0]; + } else { + remainder_expr = ir::Add::Make(remainder_terms[0], remainder_terms[1]); + for (size_t i = 2; i < remainder_terms.size(); ++i) { + remainder_expr = ir::Add::Make(remainder_expr, remainder_terms[i]); + } + } + + // Build multiplicand terms expression + ir::IndexExpr multiple_expr; + if (multiple_terms.empty()) { + multiple_expr = ir::IndexExpr(0); + } else if (multiple_terms.size() == 1) { + multiple_expr = multiple_terms[0]; + } else { + multiple_expr = ir::Add::Make(multiple_terms[0], multiple_terms[1]); + for (size_t i = 2; i < multiple_terms.size(); ++i) { + multiple_expr = ir::Add::Make(multiple_expr, multiple_terms[i]); + } + } + + // Verify if remainder range is less than denominator + auto remainder_upper = ana.UpperBound(remainder_expr); + if (!ana.ProveLT(remainder_upper, denominator).value_or(false)) { + // If the remainder may reach the denominator, keep the division/modulo of the remainder + if (expr.node_type() == ir::IrNodeTy::Div) { + return ir::Add::Make(multiple_expr, + ir::Div::Make(remainder_expr, denominator)); + } else { // Modulo operation + return ir::Mod::Make(remainder_expr, denominator); + } + } else { + // If the remainder is provably less than the denominator, its division is zero + if (expr.node_type() == ir::IrNodeTy::Div) { + return multiple_expr; + } else { // Modulo operation + return remainder_expr; + } + } +} +
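A concrete instance of the split performed by HandleDivModWithConstants, with non-negative ints standing in for symbolic terms whose bounds the analyzer can prove:

```cpp
#include <cassert>

int main() {
  for (int x = 0; x < 32; ++x) {
    for (int y = 0; y < 32; ++y) {
      // Residual 3 < 4: divisible terms keep coeff / 4, modulo keeps 3.
      int e = x * 8 + y * 4 + 3;
      assert(e / 4 == x * 2 + y);
      assert(e % 4 == 3);
      // Residual terms 3 and 2 are each < 4, but their sum reaches 4,
      // so the residual keeps its own division: x*2 + (3 + 2) / 4.
      assert((x * 8 + 3 + 2) / 4 == x * 2 + (3 + 2) / 4);
    }
  }
  return 0;
}
```

Note that the early `return expr` inside the term loop means a single residual term that may reach the denominator bails out before any split; only sums of individually small terms reach the partial-division branch.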
ir::IndexExpr BoundSimplify(const ir::IndexExpr &expr) { - // return expr if expr is not a division or modulo + // Return expr if expr is not a division or modulo if (expr.node_type() != ir::IrNodeTy::Div && expr.node_type() != ir::IrNodeTy::Mod) return expr; @@ -686,10 +802,10 @@ ir::IndexExpr BoundSimplify(const ir::IndexExpr &expr) { common::cas_intervals_t var_intervals = common::CollectVarIntervalsOfExprs({expr}); common::SymbolicExprAnalyzer ana(var_intervals); - // Because the SymbolicExprAnalyzer bound result is [lower, upper), `ProveLE` - // is used here instead of `ProveLT`. + // Because the SymbolicExprAnalyzer bound result is [lower, upper], + // `ProveLT` is used here instead of `ProveLE`. auto canBeSimplified = - ana.ProveLE(ana.UpperBound(expr.operand(0)), expr.operand(1)); + ana.ProveLT(ana.UpperBound(expr.operand(0)), expr.operand(1)); if (canBeSimplified.value_or(false)) { if (expr.node_type() == ir::IrNodeTy::Div) { @@ -698,7 +814,8 @@ ir::IndexExpr BoundSimplify(const ir::IndexExpr &expr) { return expr.operand(0); } } - return expr; + + return HandleDivModWithConstants(expr, ana); } ir::IndexExpr BroadcastSimplify(const ir::IndexExpr &expr) { diff --git a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h index 7962c933db0721..1ca95efbd68678 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h +++ b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h @@ -19,21 +19,29 @@ #include "paddle/phi/core/distributed/auto_parallel/placement_types.h" #include "paddle/phi/core/distributed/auto_parallel/process_mesh.h" -paddle::Tensor add_n_ad_func(const std::vector<paddle::Tensor>& x); +paddle::Tensor add_n_ad_func( + const std::vector<paddle::Tensor>& x, + paddle::optional<paddle::Tensor*> input_out = paddle::none); -paddle::Tensor conv2d_ad_func(const paddle::Tensor& input, - const paddle::Tensor& filter, - std::vector<int> strides, - std::vector<int> paddings, - std::string padding_algorithm, - std::vector<int> dilations, - int groups, - std::string data_format); +paddle::Tensor conv2d_ad_func( + const paddle::Tensor& input, + const paddle::Tensor& filter, + std::vector<int> strides, + std::vector<int> paddings, + std::string padding_algorithm, + std::vector<int> dilations, + int groups, + std::string data_format, + paddle::optional<paddle::Tensor*> input_out = paddle::none); -paddle::Tensor multiply_ad_func(const paddle::Tensor& x, - const paddle::Tensor& y); -paddle::Tensor& multiply__ad_func(paddle::Tensor& x, // NOLINT - const paddle::Tensor& y); +paddle::Tensor multiply_ad_func( + const paddle::Tensor& x, + const paddle::Tensor& y, + paddle::optional<paddle::Tensor*> input_out = paddle::none); +paddle::Tensor& multiply__ad_func( + paddle::Tensor& x, // NOLINT + const paddle::Tensor& y, + paddle::optional<paddle::Tensor*> input_out = paddle::none); std::tuple input_out = paddle::none); paddle::Tensor dtensor_to_local_ad_function( const paddle::Tensor& input, const phi::distributed::ProcessMesh& processmesh, - const phi::distributed::Placements& placements); + const phi::distributed::Placements& placements, + paddle::optional<paddle::Tensor*> input_out = paddle::none); paddle::Tensor dtensor_from_local_ad_function( const paddle::Tensor& input, const phi::distributed::ProcessMesh& processmesh, - const phi::distributed::Placements& placements); + const phi::distributed::Placements& placements, + paddle::optional<paddle::Tensor*> input_out = paddle::none); namespace sparse { std::tuple& x) { +paddle::Tensor add_n_ad_func(const std::vector<paddle::Tensor>& x, + paddle::optional<paddle::Tensor*> input_out) { VLOG(3) << "Running AD API: " << "add_n"; if (FLAGS_check_cuda_error) [[unlikely]] {
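The new trailing parameter on these forward declarations defaults to paddle::none, so existing call sites compile unchanged; the Python-C layer below fills it from an `out=` kwarg via GetInputOutTensorFromKwargs. A simplified model of the pattern, where std::optional and a hypothetical Tensor stand in for paddle::optional and paddle::Tensor (this is not the Paddle API):

```cpp
#include <iostream>
#include <optional>

struct Tensor { float data = 0.f; };  // hypothetical stand-in

// Mirrors the generated signature: the optional out-pointer defaults to
// "not provided", and when present the result is written through it.
Tensor add_n(const Tensor& a, const Tensor& b,
             std::optional<Tensor*> input_out = std::nullopt) {
  Tensor local;
  Tensor& out = (input_out && *input_out) ? **input_out : local;
  out.data = a.data + b.data;
  return out;
}

int main() {
  Tensor a{1.f}, b{2.f}, reused;
  add_n(a, b, &reused);                   // caller-provided destination
  std::cout << reused.data << "\n";       // 3
  std::cout << add_n(a, b).data << "\n";  // default path, fresh output
  return 0;
}
```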
diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index 893249fddc904a..0aea3ba196798f 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -31,7 +31,8 @@ paddle::Tensor conv2d_ad_func(const paddle::Tensor& input, std::string padding_algorithm, std::vector<int> dilations, int groups, - std::string data_format) { + std::string data_format, + paddle::optional<paddle::Tensor*> input_out) { VLOG(3) << "Running AD API: " << "conv2d"; if (FLAGS_check_cuda_error) [[unlikely]] { diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_from_local_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_from_local_fwd_func.cc index 8fa3b0a11a3cfd..4a06c524dc194d 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_from_local_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_from_local_fwd_func.cc @@ -25,7 +25,8 @@ COMMON_DECLARE_bool(check_cuda_error); paddle::Tensor dtensor_from_local_ad_function( const paddle::Tensor& input, const phi::distributed::ProcessMesh& process_mesh, - const phi::distributed::Placements& placements) { + const phi::distributed::Placements& placements, + paddle::optional<paddle::Tensor*> input_out) { #ifdef PADDLE_WITH_DISTRIBUTE VLOG(3) << "Running AD API: " << "dtensor_from_local dygraph"; diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_to_local_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_to_local_fwd_func.cc index 02d8f368e37953..be18aea8abd79d 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_to_local_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_to_local_fwd_func.cc @@ -23,7 +23,8 @@ COMMON_DECLARE_bool(check_cuda_error); paddle::Tensor dtensor_to_local_ad_function( const paddle::Tensor& input, const phi::distributed::ProcessMesh& process_mesh, - const phi::distributed::Placements& placements) { + const phi::distributed::Placements& placements, + paddle::optional<paddle::Tensor*> input_out) { #ifdef PADDLE_WITH_DISTRIBUTE VLOG(3) << "Running AD API: " << "dtensor_to_local dygraph"; diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc index def887365f3246..4c03ee6ef486b1 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc @@ -38,7 +38,8 @@ bool check_if_support_elementwise_mul_mem_opt(const std::string& device_type) { } paddle::Tensor multiply_ad_func(const paddle::Tensor& x, - const paddle::Tensor& y) { + const paddle::Tensor& y, + paddle::optional<paddle::Tensor*> input_out) { FLAGS_tensor_operants_mode = "eager"; VLOG(3) << "Running AD API: " << "multiply"; @@ -241,7 +242,8 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x, } paddle::Tensor& multiply__ad_func(paddle::Tensor& x, // NOLINT - const paddle::Tensor& y) { + const paddle::Tensor& y, + paddle::optional<paddle::Tensor*> input_out) { FLAGS_tensor_operants_mode = "eager"; VLOG(3) << "Running AD API: " << "multiply_"; diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/reshard_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/reshard_fwd_func.cc index 5aaf63d6c8c411..c048a4248c3184 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/reshard_fwd_func.cc +++
b/paddle/fluid/eager/api/manual/eager_manual/forwards/reshard_fwd_func.cc @@ -22,7 +22,8 @@ COMMON_DECLARE_bool(check_cuda_error); paddle::Tensor reshard_ad_function( const paddle::Tensor& input, - const phi::distributed::TensorDistAttr dist_attr) { + const phi::distributed::TensorDistAttr dist_attr, + paddle::optional input_out) { #ifdef PADDLE_WITH_DISTRIBUTE VLOG(3) << "Running AD API: " << "reshard dygraph"; diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 6cc1d9e8ba2b48..ee95ac3da7d3a7 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -660,6 +660,7 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/framework/op_registry.h" #include "paddle/utils/test_macros.h" #include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" +#include "paddle/utils/optional.h" using CPUPlace = phi::CPUPlace; {} {} @@ -1496,7 +1497,7 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False): self.grad_node_out_list = grad_node_out_list - def run(self): + def run(self, append_input_out=False): # Basic Validation Check self.DygraphYamlValidationCheck() @@ -1684,7 +1685,9 @@ def GenerateForwardLayoutAutotune( return layout_logic_str - def GenerateForwardDefinitionAndDeclaration(self, is_inplaced, grad_flag): + def GenerateForwardDefinitionAndDeclaration( + self, is_inplaced, grad_flag, append_input_out + ): namespace = self.namespace if self.forward_api_name[-1] == '_' and not is_inplaced: return @@ -1881,6 +1884,24 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced, grad_flag): inputs_args_declaration_str = ", ".join(inputs_args_declaration_list) inputs_args_definition_str = ", ".join(inputs_args_definition_list) + if ( + append_input_out + and not grad_flag + and not is_inplaced + and len(self.forward_outputs_position_map) == 1 + and next(iter(self.forward_outputs_position_map.values()))[0] + == "Tensor" + and forward_api_name != "empty_like" + ): + inputs_args_declaration_str = ( + inputs_args_declaration_str + + ", paddle::optional input_out = paddle::none" + ) + inputs_args_definition_str = ( + inputs_args_definition_str + + ", paddle::optional input_out" + ) + inputs_call_list.append("input_out") inputs_call_args_str = ", ".join(inputs_call_list) self.inputs_call_list = inputs_call_list @@ -2135,6 +2156,16 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced, grad_flag): + " ".join(amp_autocast_optional_list) ) amp_inputs_call_args_str = ", ".join(amp_inputs_call_list) + if ( + append_input_out + and not grad_flag + and not is_inplaced + and len(self.forward_outputs_position_map) == 1 + and next(iter(self.forward_outputs_position_map.values()))[0] + == "Tensor" + and forward_api_name != "empty_like" + ): + amp_inputs_call_args_str = amp_inputs_call_args_str + ", input_out" amp_call_str = ( f"return {forward_ad_function_name}({amp_inputs_call_args_str});" ) @@ -2158,6 +2189,18 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced, grad_flag): type_promote_inputs_call_args_str = ", ".join( type_promote_inputs_call_list ) + if ( + append_input_out + and not grad_flag + and not is_inplaced + and len(self.forward_outputs_position_map) == 1 + and next(iter(self.forward_outputs_position_map.values()))[0] + == "Tensor" + and forward_api_name != "empty_like" + ): + type_promote_inputs_call_args_str = ( + type_promote_inputs_call_args_str + 
", input_out" + ) type_promote_call_list = f"return {forward_ad_function_name}({type_promote_inputs_call_args_str});" x_cast = ( @@ -2180,6 +2223,19 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced, grad_flag): type_promote_inputs_call_args_str = ", ".join( type_promote_inputs_call_list ) + if ( + append_input_out + and not grad_flag + and not is_inplaced + and len(self.forward_outputs_position_map) == 1 + and next(iter(self.forward_outputs_position_map.values()))[0] + == "Tensor" + and forward_api_name != "empty_like" + ): + type_promote_inputs_call_args_str = ( + type_promote_inputs_call_args_str + ", input_out" + ) + type_promote_call_list = f"return {forward_ad_function_name}({type_promote_inputs_call_args_str});" x_cast = ( @@ -2323,7 +2379,9 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced, grad_flag): self.forward_declaration_str += f"TEST_API {returns_type_str} {forward_ad_function_name}({inputs_args_declaration_str});\n" - def GenerateInplacedForwardDygraphFunctions(self, grad_flag): + def GenerateInplacedForwardDygraphFunctions( + self, grad_flag, append_input_out + ): # Inplaced Version Dygraph Function Generation forward_api_name = self.forward_api_name forward_api_contents = self.forward_api_contents @@ -2331,7 +2389,9 @@ def GenerateInplacedForwardDygraphFunctions(self, grad_flag): if forward_api_name != "sum" and "inplace" in forward_api_contents: # Function Definition and Declaration Generation self.GenerateForwardDefinitionAndDeclaration( - is_inplaced=True, grad_flag=grad_flag + is_inplaced=True, + grad_flag=grad_flag, + append_input_out=append_input_out, ) self.UpdateCoreOpsInformation(is_inplaced=True) @@ -2367,8 +2427,8 @@ def UpdateCoreOpsInformation(self, is_inplaced): for name, (ttype, pos) in forward_outputs_position_map.items(): core_ops_returns_info[fwd_api_name][pos] = name - def run(self, grad_flag=False): - super().run() + def run(self, grad_flag=False, append_input_out=False): + super().run(append_input_out=append_input_out) ################### # Code Generation # @@ -2376,12 +2436,16 @@ def run(self, grad_flag=False): # Definition And Declaration self.GenerateForwardDefinitionAndDeclaration( - is_inplaced=False, grad_flag=grad_flag + is_inplaced=False, + grad_flag=grad_flag, + append_input_out=append_input_out, ) self.UpdateCoreOpsInformation(is_inplaced=False) - self.GenerateInplacedForwardDygraphFunctions(grad_flag) + self.GenerateInplacedForwardDygraphFunctions( + grad_flag, append_input_out=append_input_out + ) class DygraphNodeGenerator(DygraphFunctionGeneratorBase): @@ -3214,8 +3278,8 @@ def _gen_api_call_code_block( returns_str, ) - def run(self): - super().run() + def run(self, append_input_out=False): + super().run(append_input_out=append_input_out) self.ResetOptionalInputs() @@ -3299,7 +3363,7 @@ def GetBackwardAPIContents(self, forward_api_contents): return backward_api_contents - def GenerateCode(self, grad_flag=False): + def GenerateCode(self, grad_flag=False, append_input_out=True): if grad_flag: op_string = 'backward_op' else: @@ -3347,7 +3411,9 @@ def GenerateCode(self, grad_flag=False): forward_apis_dict, namespace, ) - function_generator.run(grad_flag) + function_generator.run( + grad_flag, append_input_out=append_input_out + ) self.forward_definition_str += ( function_generator.forward_definition_str + "\n" @@ -3372,7 +3438,7 @@ def GenerateCode(self, grad_flag=False): namespace, next_grad_api_contents, ) - node_generator.run() + node_generator.run(append_input_out=append_input_out) self.node_declaration_str 
+= ( node_generator.node_declaration_str + "\n" ) @@ -3407,12 +3473,12 @@ def GenerateCode(self, grad_flag=False): namespace, self.node_definition_str ) - def run(self, grad_flag=False): + def run(self, grad_flag=False, append_input_out=False): self.ParseYamlContents() self.InferNameSpace() - self.GenerateCode(grad_flag) + self.GenerateCode(grad_flag, append_input_out=append_input_out) ################ @@ -3521,7 +3587,10 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str, grad_flag): generator = DygraphForwardAndNodesGenerator( api_yaml_path, backward_yaml_path ) - generator.run() + append_input_out = ( + "string" not in api_yaml_path and "sparse" not in api_yaml_path + ) + generator.run(append_input_out=append_input_out) node_declaration_str += generator.node_declaration_str + "\n" node_definition_str += generator.node_definition_str + "\n" @@ -3556,7 +3625,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str, grad_flag): backward_yaml_path, backward_yaml_path ) - generator_grad.run(True) + generator_grad.run(True, append_input_out=False) backward_declaration_str += ( generator_grad.forward_declaration_str + "\n" diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index dc05025ee8d6d6..661427fd069bab 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -126,6 +126,8 @@ def FindParsingFunctionFromAttributeType(atype): // Get EagerTensors from args {} // Parse Attributes if needed +{} + // Parse input_out if needed {} tstate = PyEval_SaveThread(); @@ -335,7 +337,7 @@ def CollectIsForwardOnly(self): False if 'backward' in forward_api_contents.keys() else True ) - def GeneratePythonCFunction(self): + def GeneratePythonCFunction(self, no_input_out_tensor=False): namespace = self.namespace forward_inplace_map = self.forward_inplace_map forward_api_name = self.forward_api_name @@ -498,6 +500,22 @@ def GeneratePythonCFunction(self): dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) + get_input_out_str = "" + if ( + not no_input_out_tensor + and not forward_inplace_map + and len(self.forward_outputs_position_map) == 1 + and next(iter(self.forward_outputs_position_map.values()))[0] + == "Tensor" + and forward_api_name != "empty_like" + ): + dygraph_function_call_str = ( + dygraph_function_call_str + ", input_out" + ) + get_input_out_str = ( + " auto input_out = GetInputOutTensorFromKwargs(kwargs);" + ) + # Generate Python-C Function Definitions fwd_function_name = FUNCTION_NAME_TEMPLATE.format( "::", namespace, GetForwardFunctionName(forward_api_name) @@ -524,6 +542,7 @@ def GeneratePythonCFunction(self): forward_api_name, get_eager_tensor_str, parse_attributes_str, + get_input_out_str, set_device_str, noamp_dygraph_function_str, return_str, @@ -581,6 +600,7 @@ def GeneratePythonCFunction(self): inplaced_forward_api_name, get_eager_tensor_str, parse_attributes_str, + "", set_device_str, inplace_noamp_dygraph_function_str, return_str, @@ -618,7 +638,7 @@ def GeneratePythonCFunction(self): # Generate Python-C Function Registration self.python_c_function_reg_str += python_c_inplace_func_reg_str - def run(self): + def run(self, no_input_out_tensor=False): # Initialized is_forward_only self.CollectIsForwardOnly() @@ -640,7 +660,7 @@ def run(self): ) # Code Generation - self.GeneratePythonCFunction() + 
self.GeneratePythonCFunction(no_input_out_tensor) return True @@ -658,7 +678,7 @@ def __init__(self, path): self.python_c_functions_reg_str = "" self.python_c_function_declare_str = "" - def GeneratePythonCFunctions(self): + def GeneratePythonCFunctions(self, no_input_out_tensor=False): namespace = self.namespace forward_api_list = self.forward_api_list @@ -670,7 +690,7 @@ def GeneratePythonCFunctions(self): f_generator = PythonCSingleFunctionGenerator( forward_api_content, namespace ) - status = f_generator.run() + status = f_generator.run(no_input_out_tensor) if status: self.python_c_functions_str += ( @@ -698,7 +718,7 @@ def AttachNamespace(self): ) ) - def run(self): + def run(self, no_input_out_tensor=False): # Infer namespace from yaml_path self.InferNameSpace() @@ -706,7 +726,7 @@ def run(self): self.ParseForwardYamlContents() # Code Generation - self.GeneratePythonCFunctions() + self.GeneratePythonCFunctions(no_input_out_tensor) # Wrap with namespace self.AttachNamespace() @@ -763,8 +783,14 @@ def GeneratePythonCFile(filepath, python_c_str): for i in range(len(api_yaml_paths)): api_yaml_path = api_yaml_paths[i] + no_input_out_tensor = ( + "backward" in api_yaml_path + or "strings" in api_yaml_path + or "sparse" in api_yaml_path + ) + py_c_generator = PythonCGenerator(api_yaml_path) - py_c_generator.run() + py_c_generator.run(no_input_out_tensor) generated_python_c_functions += ( py_c_generator.python_c_functions_str + "\n" diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index 78e9db9e9b8d68..8d594c10392fb1 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -29,14 +29,14 @@ void SetOp(ProgramDesc* prog, const std::string& name, const std::vector& inputs, const std::vector& outputs, - bool use_mkldnn = false, + bool use_onednn = false, ISTEST_STATE is_test = ISTEST_STATE::UNSET) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); op->SetAttr("name", name); op->SetInput("X", inputs); op->SetOutput("Out", outputs); - op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("use_onednn", use_onednn); if (is_test == ISTEST_STATE::UNSET) op->MutableAttrMap()->erase("is_test"); else if (is_test == ISTEST_STATE::FALSE) diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc index 20f130cb37208e..f03d7a160e1048 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc @@ -57,7 +57,7 @@ bool validateReduceOpAttrs(const Node* node, EXPECT_TRUE( !PADDLE_GET_CONST(bool, op->GetAttr("reduce_all")), ::paddle::string::Sprintf( - "The LayerNorm fusion %s" + "The LayerNorm fusion %s " "reduction must have \'reduce_all\' attribute set to false.", name)); } diff --git a/paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h index cfd4875c73bf3e..f8f0056ff5829f 100644 --- a/paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h +++ b/paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h @@ -59,12 +59,21 @@ inline std::unordered_map GetAttributeMap( inline void SetActivationAttrs(paddle::framework::OpDesc* fused_op, paddle::framework::OpDesc* act_op, const std::string& act_type) { - if (fused_op->HasAttr("use_mkldnn")) { + bool use_mkldnn = false; + if (fused_op->HasAttr("use_mkldnn") && !fused_op->HasAttr("use_onednn")) { 
PADDLE_ENFORCE(PADDLE_GET_CONST(bool, fused_op->GetAttr("use_mkldnn")), common::errors::PreconditionNotMet( - "oneDNN activation fuses require use_mkldnn=True")); + "oneDNN activation fuses require use_onednn=True")); + } + if (fused_op->HasAttr("use_mkldnn")) { + use_mkldnn = PADDLE_GET_CONST(bool, fused_op->GetAttr("use_mkldnn")); + } + if (!use_mkldnn && fused_op->HasAttr("use_onednn")) { + PADDLE_ENFORCE(PADDLE_GET_CONST(bool, fused_op->GetAttr("use_onednn")), + common::errors::PreconditionNotMet( + "oneDNN activation fuses require use_onednn=True")); } - fused_op->SetAttr("use_mkldnn", true); + fused_op->SetAttr("use_onednn", true); auto attr_map = GetAttributeMap(act_type); for (const auto& attr : attr_map) { diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 5d504c71ff1033..fa0df97f219b27 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1420,7 +1420,7 @@ struct SimpleOpTypeSetTeller : public Teller { #endif if (dtype != -1 && dtype != 2 && dtype != 3 && dtype != 5 && dtype != 6) { VLOG(3) - << "the fill_any_like only supports int32/int64/float32/float64 by" + << "the fill_any_like only supports int32/int64/float32/float64 by " "trt8.4 below"; return false; } diff --git a/paddle/fluid/operators/collective/c_embedding_op.cc b/paddle/fluid/operators/collective/c_embedding_op.cc index 2c228e5a17775c..182ace60aa7fce 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cc +++ b/paddle/fluid/operators/collective/c_embedding_op.cc @@ -87,7 +87,7 @@ class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker { "and the out-of-bounds will be set to 0 ") .SetDefault(0); AddAttr("vocab_size", - "(int64, default -1), The total vocabulary size to check" + "(int64, default -1), The total vocabulary size to check " "the out-of-bounds ids. If it is -1, no check will be ") .SetDefault(-1); AddComment(R"DOC( diff --git a/paddle/fluid/operators/fused/fused_adam_op.cc b/paddle/fluid/operators/fused/fused_adam_op.cc index 932bdbfd90a6c2..7a890e3e961503 100644 --- a/paddle/fluid/operators/fused/fused_adam_op.cc +++ b/paddle/fluid/operators/fused/fused_adam_op.cc @@ -115,7 +115,7 @@ class FusedAdamOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0); AddAttr("use_adamw", "(bool, default False) " - "Whether to use AdamW" + "Whether to use AdamW. 
" "True for decoupled weight decay") .SetDefault(false); AddAttr("multi_precision", diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index c69e9f98497391..fc58a32ef7c0aa 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -124,9 +124,9 @@ class FusedAttentionOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(y_dim.size(), 2, common::errors::InvalidArgument( - "The dimensions of qkv_weight must be 2 if enable" - "transpose_qkv_wb: (dim_embed, 3 * dim_embed)," - "but received dimensions of" + "The dimensions of qkv_weight must be 2 if enable " + "transpose_qkv_wb: (dim_embed, 3 * dim_embed), " + "but received dimensions of " "Input is [%d]", y_dim.size())); PADDLE_ENFORCE_GT(num_heads, @@ -159,7 +159,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(y_dim.size(), 4, common::errors::InvalidArgument( - "The dimensions of qkv_weight must be 4 if not" + "The dimensions of qkv_weight must be 4 if not " "enable transpose_qkv_wb: (3, num_head, dim_head, " "dim_embed), but received [%d]", y_dim.size())); @@ -186,8 +186,8 @@ class FusedAttentionOp : public framework::OperatorWithKernel { x_dim.size(), 3, common::errors::InvalidArgument("The dimensions of x must be 3" - "(batch_size, seq_len, dim_embed)," - "but received dimensions of" + "(batch_size, seq_len, dim_embed), " + "but received dimensions of " "Input is [%d]", x_dim.size())); @@ -431,7 +431,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { "attn_dropout_implementation", "[\"downgrade_in_infer\"|\"upscale_in_train\"]" "There are two kinds of ways to implement dropout" - "(the mask below is a tensor have the same shape with input" + "(the mask below is a tensor have the same shape with input, " "the value of mask is 0 or 1, the ratio of 0 is dropout_rate)" "1. downgrade_in_infer(default), downgrade the outcome at inference " "time" diff --git a/paddle/fluid/operators/fused/fused_conv2d_op.cc b/paddle/fluid/operators/fused/fused_conv2d_op.cc index 04d2d4043bf966..fb7bb428ef24ba 100644 --- a/paddle/fluid/operators/fused/fused_conv2d_op.cc +++ b/paddle/fluid/operators/fused/fused_conv2d_op.cc @@ -53,13 +53,13 @@ TODO: Documentation of conv2d op. protected: void Apply() { AddInput("Bias", - "(Tensor) Bias to be added to each output of filter application." - "The format of output tensor is X (one-dimensional) of size equal" + "(Tensor) Bias to be added to each output of filter application. " + "The format of output tensor is X (one-dimensional) of size equal " "to the number of output channels. Only used with MKL-DNN.") .AsDispensable(); AddInput("ResidualData", "(Tensor) Tensor with residual data " - "to which convolution output will be added." + "to which convolution output will be added. 
" "Used with fuse_residual_connection fusion.") .AsDispensable(); AddAttr("fuse_activation", diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc index 28a87239f37693..c4a1ce652c905d 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc @@ -72,16 +72,16 @@ class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel { x_dim.size(), 3, common::errors::InvalidArgument("The dimensions of x must be 3" - "(batch_size, seq_len, dim_embed)," - "but received dimensions of" + "(batch_size, seq_len, dim_embed), " + "but received dimensions of " "Input is [%d]", x_dim.size())); PADDLE_ENFORCE_EQ( y_dim.size(), 4, common::errors::InvalidArgument("The dimensions of qkv_weight must be 4" - "(3, num_head, dim_head, dim_embed)," - "but received dimensions of" + "(3, num_head, dim_head, dim_embed), " + "but received dimensions of " "Input is [%d]", y_dim.size())); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index 93c688d149ac77..d2a262e2bac763 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -64,7 +64,7 @@ void MultiGRUOp::InferShape(framework::InferShapeContext* ctx) const { wx_dims[i][0], x_mat_dims[1], common::errors::InvalidArgument( - "The first dimension of flattened WeightX #%d" + "The first dimension of flattened WeightX #%d " "should equal to last dimension of flattened input X, but " "received fattened WeightX dimension is:%d, flattened X dimension " "is:%d", @@ -205,7 +205,7 @@ void MultiGRUOpMaker::Make() { "Number of stacked GRU layers.") .SetDefault(1); AddAttr("origin_mode", - "bool" + "bool " "use origin mode in article https://arxiv.org/abs/1412.3555") .SetDefault(false); AddAttr( diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc index 8033cdb6489016..771c0ff19ce2c8 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.cc +++ b/paddle/fluid/operators/memcpy_d2h_op.cc @@ -62,12 +62,13 @@ class MemcpyD2HOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(phi::DenseTensor) The type of output " "is the same as input X."); - AddAttr("dst_place_type", - "Determine the dst place of tensor copy. " - "By Now it ONLY support XPU/CUDAPlace <-> CUDAPinnedPlace/CPU" - "Other place type is Unimplemented and will cause ERROR." - "0: dst is on CPUPlace. " - "1: dst is on CUDAPinnedPlace. "); + AddAttr( + "dst_place_type", + "Determine the dst place of tensor copy. " + "By Now it ONLY support XPU/CUDAPlace <-> CUDAPinnedPlace/CPU. " + "Other place type is Unimplemented and will cause ERROR. " + "0: dst is on CPUPlace. " + "1: dst is on CUDAPinnedPlace. "); AddComment(R"DOC( MemcpyD2H Operator. By now, it ONLY supports the memcopy between CUDAPlace <-> CUDAPinnedPlace/CPU. diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc index a65758f5ecf8a8..5a01de461429a2 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.cc +++ b/paddle/fluid/operators/memcpy_h2d_op.cc @@ -64,10 +64,10 @@ class MemcpyH2DOpProtoMaker : public framework::OpProtoAndCheckerMaker { "is the same as input X."); AddAttr("dst_place_type", "Determine the dst place of tensor copy. " - "By Now it support:" - "0. CUDAPinnedPlace/CPU <->CUDAPlace" - "1. CPU <->XPUPlace" - "2. 
CPU <->IPUPlace" + "By Now it support: " + "0. CUDAPinnedPlace/CPU <->CUDAPlace. " + "1. CPU <->XPUPlace. " + "2. CPU <->IPUPlace. " "Other place type is Unimplemented and will cause ERROR."); AddComment(R"DOC( MemcpyD2H Operator. diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc index 88737990847f34..cef735b1fdac82 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc @@ -50,7 +50,7 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor of sequence_mask op."); AddOutput("Y", "The output mask of sequence_mask op."); AddInput("MaxLenTensor", - "Max length tensor" + "Max length tensor " "have higher priority than maxlen attribute") .AsDispensable(); AddAttr("maxlen", diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 79a93bfca9e9e6..8fde85928e4070 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -127,8 +127,8 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput( "X", - "A Variable list. The shape and data type of the list elements" - "should be consistent. Variable can be multi-dimensional Tensor" + "A Variable list. The shape and data type of the list elements " + "should be consistent. Variable can be multi-dimensional Tensor " "or phi::DenseTensor, and data types can be: float32, float64, int32, " "int64.") .AsDuplicable(); diff --git a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py index c426d3325a0811..0c70fbb72f98d3 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py @@ -134,6 +134,7 @@ 'KthvalueInferMeta', 'MaxPoolWithIndexInferMeta', 'MaxPoolV2InferMeta', + 'MinMaxWithIndexInferMeta', 'MultinomialInferMeta', 'OverlapAddInferMeta', 'PadInferMeta', diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc index 6750759633d0b8..9bf285da4d77a9 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc @@ -315,19 +315,37 @@ bool AnyOpInferSymbolicShape(pir::Operation *op, axis.size() == 0 /*reduce_all*/); } -bool ArgmaxOpInferSymbolicShape(pir::Operation *op, - pir::InferSymbolicShapeContext *infer_context) { +bool MinMaxOpInferSymbolicShape(pir::Operation *op, + pir::InferSymbolicShapeContext *infer_context, + bool output_val_and_ind = false) { bool flatten = GetBoolAttr(op, "flatten"); - bool keepdims = GetBoolAttr(op, "keepdims"); + bool keepdims = false; + int axis = 0; + + if (output_val_and_ind) { + keepdims = GetBoolAttr(op, "keepdim"); + PADDLE_ENFORCE_NE( + op->attributes().find("dim"), + op->attributes().end(), + common::errors::InvalidArgument( + "'dim' Attribute is expected for Min/MaxWithIndexOp. 
")); + axis = op->attributes() + .at("dim") + .dyn_cast() + .data() + .to(); + } else { + keepdims = GetBoolAttr(op, "keepdims"); + const auto &axis_shape_or_data = + infer_context->GetShapeOrDataForValue(op->operand_source(1)); + axis = static_cast( + axis_shape_or_data.data().value().at(0).Get()); + } const auto &input_sym_shape = infer_context->GetShapeOrDataForValue(op->operand_source(0)).shape(); - int rank = input_sym_shape.size(); - const auto &axis_shape_or_data = - infer_context->GetShapeOrDataForValue(op->operand_source(1)); - int axis = - static_cast(axis_shape_or_data.data().value().at(0).Get()); + int rank = input_sym_shape.size(); if (axis < 0) axis += rank; const auto &out_sym_shape = [&] { @@ -357,14 +375,31 @@ bool ArgmaxOpInferSymbolicShape(pir::Operation *op, symbol::TensorShapeOrDataDimExprs(out_sym_shape)}; infer_context->SetShapeOrDataForValue(op->result(0), shape_data); + if (output_val_and_ind) + infer_context->SetShapeOrDataForValue(op->result(1), shape_data); return true; } +#define DEFINE_MINMAX_OP_INFER_FUNC(OpName, output_val_and_ind) \ + bool OpName##OpInferSymbolicShape( \ + pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { \ + return MinMaxOpInferSymbolicShape(op, infer_context, output_val_and_ind); \ + } + +DEFINE_MINMAX_OP_INFER_FUNC(Argmax, false) +DEFINE_MINMAX_OP_INFER_FUNC(MaxWithIndex, true) +#undef DEFINE_MINMAX_OP_INFER_FUNC + bool ArgminOpInferSymbolicShape(pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { return ArgmaxOpInferSymbolicShape(op, infer_context); } +bool MinWithIndexOpInferSymbolicShape( + pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { + return MaxWithIndexOpInferSymbolicShape(op, infer_context); +} + bool AsComplexOpInferSymbolicShape( pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { pir::Value operand_source = op->operand_source(0); diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h index 9868d08d8a290d..8d21b51eb2719f 100755 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h @@ -93,8 +93,10 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(Lu) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Lu_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Mode) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Max) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(MaxWithIndex) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Maxout) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Min) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(MinWithIndex) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Mean) OP_DECLARE_INFER_SYMBOLIC_SHAPE(MeanAll) OP_DECLARE_INFER_SYMBOLIC_SHAPE(MatrixPower) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index d2af764fc392d7..1067c4e6854e3b 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -901,7 +901,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { true, common::errors::PreconditionNotMet( "Could not parse args and kwargs successfully, " - "please check your input first and make" + "please check your input first and make " "sure you are on the right way. 
" "The expected arguments as follow: (" "value, place, persistable, zero_copy, " @@ -1307,7 +1307,7 @@ int StringTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { true, common::errors::PreconditionNotMet( "Could not parse args and kwargs successfully, " - "please check your input first and make" + "please check your input first and make " "sure you are on the right way. " "The expected arguments as follow: (" "value, zero_copy, name, dims)")); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 15b78262ef8e0b..b5e4bb3e82a6bc 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -1353,8 +1353,8 @@ static PyObject* eager_api_set_master_grads(PyObject* self, PADDLE_ENFORCE_NE( grad, nullptr, - common::errors::Fatal("Detected nullptr grad" - "Please check if you have manually cleared" + common::errors::Fatal("Detected nullptr grad. " + "Please check if you have manually cleared " "the grad inside autograd_meta")); if (((*grad).has_allocation() || (*grad).is_dist_tensor()) && ((*grad).dtype() == phi::DataType::FLOAT16 || diff --git a/paddle/fluid/pybind/eager_generator.cc b/paddle/fluid/pybind/eager_generator.cc index 0ecd4c6263c1bf..e6b8e0ccb86bba 100644 --- a/paddle/fluid/pybind/eager_generator.cc +++ b/paddle/fluid/pybind/eager_generator.cc @@ -502,6 +502,7 @@ static void SlotNameMatching( grad_fwd_slotname_map[grad_slot_name] != fwd_slot_name) { PADDLE_THROW(common::errors::Fatal( "Detected mismatched slot names." + "Detected mismatched slot names: " "grad_slot_name %s matches both %s and %s fwd_slot_name", grad_slot_name, grad_fwd_slotname_map[grad_slot_name], @@ -536,7 +537,7 @@ static void SlotNameMatching( if (grad_fwd_slotname_map.count(grad_slot_name) && grad_fwd_slotname_map[grad_slot_name] != fwd_slot_name) { PADDLE_THROW(common::errors::Fatal( - "Detected mismatched slot names" + "Detected mismatched slot names: " "grad_slot_name %s matches both %s and %s fwd_slot_name", grad_slot_name, grad_fwd_slotname_map[grad_slot_name], diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 8af90c243833d3..2aa7606619bb4b 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -931,8 +931,8 @@ static PyObject* tensor_clear_gradient(TensorObject* self, grad = egr::EagerUtils::mutable_grad(self->tensor); PADDLE_ENFORCE( grad != nullptr, - common::errors::Fatal("Detected nullptr grad" - "Please check if you have manually cleared" + common::errors::Fatal("Detected nullptr grad. " + "Please check if you have manually cleared " "the grad inside autograd_meta")); } else { auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor); @@ -995,8 +995,8 @@ static PyObject* tensor__zero_grads(TensorObject* self, paddle::Tensor* grad = egr::EagerUtils::mutable_grad(self->tensor); PADDLE_ENFORCE( grad != nullptr, - common::errors::Fatal("Detected nullptr grad" - "Please check if you have manually cleared" + common::errors::Fatal("Detected nullptr grad. 
" + "Please check if you have manually cleared " "the grad inside autograd_meta")); if (grad->initialized()) { if (grad->is_dense_tensor() || grad->is_dist_tensor()) { diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index cd0d67efcd4439..d89c8eb8418e52 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -311,8 +311,8 @@ int tensor_properties_set_grad(TensorObject* self, paddle::Tensor* grad = egr::EagerUtils::mutable_grad(self->tensor); PADDLE_ENFORCE( grad != nullptr, - common::errors::Fatal("Detected NULL grad" - "Please check if you have manually cleared" + common::errors::Fatal("Detected NULL grad. " + "Please check if you have manually cleared " "the grad inside autograd_meta")); const phi::distributed::ProcessMesh* mesh = nullptr; if (InputsContainDistTensor(&mesh, src, self->tensor, *grad)) { @@ -334,8 +334,8 @@ int tensor_properties_set_grad_(TensorObject* self, paddle::Tensor* grad = egr::EagerUtils::mutable_grad(self->tensor); PADDLE_ENFORCE( grad != nullptr, - common::errors::Fatal("Detected NULL grad" - "Please check if you have manually cleared" + common::errors::Fatal("Detected NULL grad. " + "Please check if you have manually cleared " "the grad inside autograd_meta")); *grad = src; return 0; diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index cddb8c4e90bc16..4319540cacdaf9 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -3097,4 +3097,16 @@ void EagerSetDeviceId() { } } +paddle::optional GetInputOutTensorFromKwargs(PyObject* kwargs) { + if (!kwargs) { + return paddle::none; + } + PyObject* obj = PyDict_GetItemString(kwargs, "out"); + if (obj && PyObject_TypeCheck(obj, p_tensor_type)) { + return paddle::make_optional( + &(reinterpret_cast(obj)->tensor)); + } + return paddle::none; +} + } // namespace paddle::pybind diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index e0a1c035b353d5..95d4ac9fd2424c 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -514,5 +514,7 @@ CvtPlacements(phi::distributed::Placements placements, int ndim); void EagerSetDeviceId(); +paddle::optional GetInputOutTensorFromKwargs(PyObject* kwargs); + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index eefe6bf8e4e0ea..c7869861793036 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -737,7 +737,7 @@ void BindImperative(py::module *m_ptr) { } else { PADDLE_THROW(common::errors::InvalidArgument( "Incompatible Place Type: supports XPUPlace, CUDAPlace, " - "CPUPlace, IPUPlace, XPUPinnedPlace" + "CPUPlace, IPUPlace, XPUPinnedPlace " "and CUDAPinnedPlace, " "but got Unknown Type!")); } diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index 288a5bc95f18dd..cb73b45fa4cb0f 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -337,7 +337,7 @@ void PruneWithInput(const std::vector &input_vars, if (!input_vars_set.empty() && SomeInSet(op_results, input_vars_set)) { PADDLE_THROW(common::errors::InvalidArgument( "The input_var create by: '{%s}' is not involved in the " - "output_vars calculation" + "output_vars calculation. 
" "Please remove it from input_vars.", op->name())); } diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index 1e00d51799fad9..9872001ece2ec6 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -342,9 +342,9 @@ void BindPlace(pybind11::module &m) { // NOLINT } #else LOG(ERROR) << string::Sprintf( - "Cannot use CustomDevice because you have installed CPU/GPU" + "Cannot use CustomDevice because you have installed CPU/GPU " "version PaddlePaddle.\n" - "If you want to use CustomDevice, please try to install" + "If you want to use CustomDevice, please try to install " "CustomDevice version " "PaddlePaddle by: pip install paddlepaddle\n" "If you only have CPU, please change " diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c95f73763ca956..07bf166122bbb1 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2596,7 +2596,7 @@ All parameter, weight, gradient are variables in Paddle. VLOG(1) << string::Sprintf( "Cannot use get_all_device_type because you have installed " "CPU/GPU version PaddlePaddle.\n" - "If you want to use get_all_device_type, please try to install" + "If you want to use get_all_device_type, please try to install " "CustomDevice version " "PaddlePaddle by: pip install paddlepaddle\n"); #endif @@ -2624,7 +2624,7 @@ All parameter, weight, gradient are variables in Paddle. VLOG(1) << string::Sprintf( "Cannot use get_available_device because you have installed " "CPU/GPU version PaddlePaddle.\n" - "If you want to use get_available_device, please try to install" + "If you want to use get_available_device, please try to install " "CustomDevice version " "PaddlePaddle by: pip install paddlepaddle\n"); #endif @@ -2639,7 +2639,7 @@ All parameter, weight, gradient are variables in Paddle. "Cannot use get_available_custom_device because you have " "installed CPU/GPU version PaddlePaddle.\n" "If you want to use get_available_custom_device, please try to " - "install" + "install " "CustomDevice version " "PaddlePaddle by: pip install paddlepaddle\n"); #endif @@ -2657,7 +2657,7 @@ All parameter, weight, gradient are variables in Paddle. 
"Cannot use get_custom_device_count because you have " "installed CPU/GPU version PaddlePaddle.\n" "If you want to use get_custom_device_count, please try to " - "install" + "install " "CustomDevice version " "PaddlePaddle by: pip install paddlepaddle\n"); #endif diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index 4be2fe7a31976d..73f62793dd55f3 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -529,7 +529,7 @@ static void ParseIndex(const paddle::Tensor& tensor, PADDLE_ENFORCE_EQ(slice_tensor.shape()[i], dim_len, common::errors::OutOfRange( - "The shape of boolean index %d did not match" + "The shape of boolean index %d did not match " "indexed tensor %d along axis %d.", slice_tensor.shape()[0], dim_len, diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index fe7ce761c9cbff..102316279b0970 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -512,7 +512,7 @@ void SetTensorFromPyArrayT( } else { PADDLE_THROW(common::errors::InvalidArgument( "Incompatible place type: Tensor.set() supports " - "CPUPlace, CUDAPlace" + "CPUPlace, CUDAPlace " "and CUDAPinnedPlace, but got %s!", place)); } diff --git a/paddle/phi/api/generator/api_base.py b/paddle/phi/api/generator/api_base.py index dc96c9e3c68353..708ae750c747dd 100644 --- a/paddle/phi/api/generator/api_base.py +++ b/paddle/phi/api/generator/api_base.py @@ -238,7 +238,9 @@ def get_grad_output(self, inplace_flag): else: return f"""std::make_tuple({", ".join(args)})""" - def get_declare_args(self, inplace_flag=False): + def get_declare_args( + self, inplace_flag=False, grad_flag=False, append_input_out=False + ): declare_args = self.get_input_tensor_args(inplace_flag) for name in self.attrs['names']: default_value = '' @@ -248,13 +250,37 @@ def get_declare_args(self, inplace_flag=False): self.attrs['attr_info'][name][0] + ' ' + name + default_value ) + if ( + not grad_flag + and not inplace_flag + and append_input_out + and len(self.outputs['names']) == 1 + and self.outputs['types'][0] == "Tensor" + and self.api != "empty_like" + ): + declare_args.append( + "paddle::optional input_out = paddle::none" + ) + return ", ".join(declare_args) - def get_define_args(self, inplace_flag=False): + def get_define_args( + self, inplace_flag=False, grad_flag=False, append_input_out=True + ): define_args = self.get_input_tensor_args(inplace_flag) for name in self.attrs['names']: define_args.append(self.attrs['attr_info'][name][0] + ' ' + name) + if ( + not grad_flag + and not inplace_flag + and append_input_out + and len(self.outputs['names']) == 1 + and self.outputs['types'][0] == "Tensor" + and self.api != "empty_like" + ): + define_args.append("paddle::optional input_out") + return ", ".join(define_args) def parse_args(self, api_name, api_item_yaml): @@ -518,12 +544,12 @@ def parse_data_transform(self, api_item_yaml): def get_return_type(self, inplace_flag=False): return None - def gene_api_declaration(self): + def gene_api_declaration(self, grad_flag=False, append_input_out=True): api_declaration = "" api_func_name = self.get_api_func_name() if api_func_name[-1] != '_': api_declaration = f""" -PADDLE_API {self.get_return_type()} {api_func_name}({self.get_declare_args()}); +PADDLE_API {self.get_return_type()} {api_func_name}({self.get_declare_args(grad_flag=grad_flag, append_input_out=append_input_out)}); """ if self.is_base_api and len(self.inplace_map) > 0: @@ -532,7 +558,7 @@ def gene_api_declaration(self): 
api_declaration = ( api_declaration + f""" -PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)}); +PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True, grad_flag=grad_flag, append_input_out=append_input_out)}); """ ) @@ -1572,7 +1598,7 @@ def gene_invoke_code(self, invoke_code, params_code): return {invoke_code}; }}""" - def gene_api_code(self): + def gene_api_code(self, grad_flag=False, append_input_out=True): if self.is_base_api: api_code = self.gene_base_api_code() if len(self.inplace_map) > 0: @@ -1585,5 +1611,7 @@ def gene_api_code(self): return '' else: invoke_code = self.invoke - params_code = self.get_define_args() + params_code = self.get_define_args( + grad_flag=grad_flag, append_input_out=append_input_out + ) return self.gene_invoke_code(invoke_code, params_code) diff --git a/paddle/phi/api/generator/api_gen.py b/paddle/phi/api/generator/api_gen.py index 68b06b022381e2..363371854a7128 100644 --- a/paddle/phi/api/generator/api_gen.py +++ b/paddle/phi/api/generator/api_gen.py @@ -217,7 +217,20 @@ def gene_output( if inplace_flag and self.outputs['names'][0] in self.inplace_map else "" ) - output_create = f""" + if ( + len(self.outputs['names']) == 1 + and self.outputs['types'][0] == "Tensor" + and not ( + inplace_flag + and self.outputs['names'][0].split('@')[0] + in self.inplace_map + ) + and self.api != "empty_like" + ): + output_create = f""" +{code_indent} Tensor out_tmp; Tensor& api_output = input_out ? **input_out : out_tmp;""" + else: + output_create = f""" {code_indent} {return_type} api_output{inplace_assign};""" set_out_func = ( 'SetKernelOutput' @@ -416,14 +429,16 @@ def reset_view_after_fallback( class BackwardAPI(ForwardAPI): - def gene_base_api_code(self, inplace_flag=False): + def gene_base_api_code( + self, inplace_flag=False, grad_flag=False, append_input_out=True + ): api_func_name = self.get_api_func_name() if inplace_flag and api_func_name[-1] != '_': inplace_name = api_func_name + '_' else: inplace_name = api_func_name api_code = f""" -PADDLE_API {self.get_return_type(inplace_flag)} {inplace_name}({self.get_define_args(inplace_flag)}) {{ +PADDLE_API {self.get_return_type(inplace_flag)} {inplace_name}({self.get_define_args(inplace_flag, grad_flag=grad_flag, append_input_out=append_input_out)}) {{ {self.get_grad_outputs_define(inplace_flag)} {self.get_optional_inputs_change(inplace_flag)} {api_func_name}({self.get_grad_api_call_args(inplace_flag)}); @@ -432,7 +447,7 @@ def gene_base_api_code(self, inplace_flag=False): """ return api_code - def gene_api_code(self): + def gene_api_code(self, grad_flag=False, append_input_out=False): if not self.is_base_api and not self.is_only_composite_api: invoke_func_name = self.invoke.split('(')[0] if (not invoke_func_name.endswith("_grad")) and ( @@ -443,14 +458,17 @@ def gene_api_code(self): if self.is_only_composite_api: return "" - api_code = self.gene_base_api_code() + api_code = self.gene_base_api_code( + grad_flag=grad_flag, append_input_out=append_input_out + ) if self.is_base_api and len(self.inplace_map) > 0: if self.api[-1] == '_': api_code = "" api_code = api_code + self.gene_base_api_code_for_inplace() + return api_code - def gene_api_declaration(self): + def gene_api_declaration(self, grad_flag=False, append_input_out=True): if not self.is_base_api and not self.is_only_composite_api: invoke_func_name = self.invoke.split('(')[0] if (not invoke_func_name.endswith("_grad")) and ( @@ -465,7 +483,7 @@ 
def gene_api_declaration(self): api_func_name = self.get_api_func_name() if api_func_name[-1] != '_': api_declaration = f""" -PADDLE_API {self.get_return_type()} {api_func_name}({self.get_declare_args()}); +PADDLE_API {self.get_return_type()} {api_func_name}({self.get_declare_args(append_input_out=append_input_out)}); """ if self.is_base_api and len(self.inplace_map) > 0: @@ -474,7 +492,7 @@ def gene_api_declaration(self): api_declaration = ( api_declaration + f""" -PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)}); +PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True,append_input_out=append_input_out)}); """ ) @@ -633,12 +651,20 @@ def generate_api( if forward_api.is_dygraph_api and is_fused_ops_yaml: forward_api.is_dygraph_api = False - header_file.write(forward_api.gene_api_declaration()) - source_file.write(forward_api.gene_api_code()) + header_file.write( + forward_api.gene_api_declaration( + grad_flag=grad_flag, append_input_out=not grad_flag + ) + ) + source_file.write(forward_api.gene_api_code(grad_flag=grad_flag)) forward_api.is_dygraph_api = True - header_file.write(forward_api.gene_api_declaration()) - source_file.write(forward_api.gene_api_code()) + header_file.write( + forward_api.gene_api_declaration( + grad_flag=grad_flag, append_input_out=not grad_flag + ) + ) + source_file.write(forward_api.gene_api_code(grad_flag=grad_flag)) header_file.write(namespace[1]) source_file.write(namespace[1]) diff --git a/paddle/phi/api/generator/backward_api_gen.py b/paddle/phi/api/generator/backward_api_gen.py index 320209d7483b3d..86d491460d5cf9 100644 --- a/paddle/phi/api/generator/backward_api_gen.py +++ b/paddle/phi/api/generator/backward_api_gen.py @@ -89,15 +89,23 @@ def check_args(self, forward_config): ), f"{self.api} : Output error: The number of outputs should be less then the number of inputs of forward api. \ Please check the output of {self.api} in yaml." 
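
Taken together, the generator changes in api_base.py and api_gen.py append a trailing `input_out` parameter to forward APIs with exactly one `Tensor` output (skipping inplace, grad, and `empty_like` variants) and route it into output creation; `GetInputOutTensorFromKwargs` above supplies it from a Python-side `out=` keyword. A sketch of the emitted C++ for a hypothetical op `foo`; the `paddle::optional<Tensor*>` template argument is inferred from the `**input_out` dereference, since angle brackets did not survive extraction in this diff:

#include "paddle/utils/optional.h"

// Hypothetical generated declaration (get_declare_args with append_input_out):
PADDLE_API Tensor foo(const Tensor& x,
                      paddle::optional<Tensor*> input_out = paddle::none);

// Hypothetical generated body (the gene_output branch added above):
PADDLE_API Tensor foo(const Tensor& x, paddle::optional<Tensor*> input_out) {
  Tensor out_tmp;
  // Write into the caller-provided tensor when present; otherwise the local
  // temporary becomes the returned value.
  Tensor& api_output = input_out ? **input_out : out_tmp;
  // ... kernel selection and dispatch fill api_output ...
  return api_output;
}
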
- def get_declare_args(self, inplace_flag=False): - return self.get_define_args() + def get_declare_args( + self, inplace_flag=False, grad_flag=False, append_input_out=False + ): + return self.get_define_args( + grad_flag=grad_flag, append_input_out=append_input_out + ) - def get_define_args(self, inplace_flag=False): + def get_define_args( + self, inplace_flag=False, grad_flag=False, append_input_out=False + ): out_type_map = { 'Tensor': 'Tensor*', 'std::vector': 'std::vector', } - inputs_and_attrs = super().get_define_args() + inputs_and_attrs = super().get_define_args( + grad_flag=grad_flag, append_input_out=False + ) outs = [] for i, name in enumerate(self.outputs['names']): outs.append( @@ -111,7 +119,7 @@ def get_define_args(self, inplace_flag=False): def gene_return_code(self): return "" - def gene_api_declaration(self): + def gene_api_declaration(self, grad_flag=False, append_input_out=False): if not self.is_base_api and not self.is_only_composite_api: invoke_func_name = self.invoke.split('(')[0] if (not invoke_func_name.endswith("_grad")) and ( diff --git a/paddle/phi/api/generator/dist_api_gen.py b/paddle/phi/api/generator/dist_api_gen.py index 72d2afccfafb42..ed47941a61570d 100644 --- a/paddle/phi/api/generator/dist_api_gen.py +++ b/paddle/phi/api/generator/dist_api_gen.py @@ -1140,9 +1140,16 @@ def generate_output_creation_code(self) -> str: return_type, inplace_assign_code ) else: - output_creation_code += API_OUT_CREATION_TEMPLATE.format( - return_type, "" - ) + if ( + len(self.outputs['names']) == 1 + and self.outputs['types'][0] == "Tensor" + and self.api != "empty_like" + ): + output_creation_code += "Tensor out_tmp; Tensor& api_output = input_out ? **input_out : out_tmp;" + else: + output_creation_code += API_OUT_CREATION_TEMPLATE.format( + return_type, "" + ) # kernel output generate self.dist_output_args.append('dist_out') self.dense_output_args.append('dense_out') @@ -2092,7 +2099,9 @@ def check_argument_whether_support_auto_parallel(self): return True # override BaseAPI's method - def gene_base_api_code(self, inplace_flag=False): + def gene_base_api_code( + self, inplace_flag=False, grad_flag=False, append_input_out=True + ): # init status self.inplace_flag = inplace_flag self.dist_output_args = [] @@ -2159,14 +2168,25 @@ def gene_base_api_code(self, inplace_flag=False): class DistBackwardAPI(DistForwardAPI): - def gene_base_api_code(self, inplace_flag=False): - return BackwardAPI.gene_base_api_code(self, inplace_flag) + def gene_base_api_code( + self, inplace_flag=False, grad_flag=False, append_input_out=True + ): + return BackwardAPI.gene_base_api_code( + self, + inplace_flag, + grad_flag=grad_flag, + append_input_out=append_input_out, + ) - def gene_api_code(self): - return BackwardAPI.gene_api_code(self) + def gene_api_code(self, grad_flag=False, append_input_out=False): + return BackwardAPI.gene_api_code( + self, grad_flag=grad_flag, append_input_out=append_input_out + ) - def gene_api_declaration(self): - return BackwardAPI.gene_api_declaration(self) + def gene_api_declaration(self, grad_flag=False, append_input_out=True): + return BackwardAPI.gene_api_declaration( + self, grad_flag=grad_flag, append_input_out=append_input_out + ) def generate_api( @@ -2233,12 +2253,22 @@ def generate_api( if dist_forward_api.is_dygraph_api and is_fused_ops_yaml: dist_forward_api.is_dygraph_api = False - header_file.write(dist_forward_api.gene_api_declaration()) - source_file.write(dist_forward_api.gene_api_code()) + header_file.write( + dist_forward_api.gene_api_declaration( + 
grad_flag=grad_flag, append_input_out=not grad_flag + ) + ) + source_file.write( + dist_forward_api.gene_api_code(grad_flag=grad_flag) + ) dist_forward_api.is_dygraph_api = True - header_file.write(dist_forward_api.gene_api_declaration()) - source_file.write(dist_forward_api.gene_api_code()) + header_file.write( + dist_forward_api.gene_api_declaration( + grad_flag=grad_flag, append_input_out=not grad_flag + ) + ) + source_file.write(dist_forward_api.gene_api_code(grad_flag=grad_flag)) header_file.write(namespace[1]) source_file.write(namespace[1]) diff --git a/paddle/phi/api/generator/dist_bw_api_gen.py b/paddle/phi/api/generator/dist_bw_api_gen.py index 2d7abedfb02061..b85e40b59fa80d 100644 --- a/paddle/phi/api/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/generator/dist_bw_api_gen.py @@ -417,8 +417,12 @@ def gene_return_code(self): return "" # override BaseAPI's method - def gene_api_declaration(self) -> str: - return BackwardAPI.gene_api_declaration(self) + def gene_api_declaration( + self, grad_flag=False, append_input_out=False + ) -> str: + return BackwardAPI.gene_api_declaration( + self, grad_flag=grad_flag, append_input_out=not grad_flag + ) def generate_reshard_output_code(self): reshard_output_code = "" diff --git a/paddle/phi/api/generator/sparse_api_gen.py b/paddle/phi/api/generator/sparse_api_gen.py index 97a8c9994f92fb..019900a9999660 100644 --- a/paddle/phi/api/generator/sparse_api_gen.py +++ b/paddle/phi/api/generator/sparse_api_gen.py @@ -23,10 +23,10 @@ class SparseAPI(ForwardAPI): def __init__(self, api_item_yaml): super().__init__(api_item_yaml) - def gene_api_declaration(self): + def gene_api_declaration(self, grad_flag=False, append_input_out=False): return f""" // {", ".join(self.outputs['names'])} -{super().gene_api_declaration()} +{super().gene_api_declaration(append_input_out=False)} """ def gene_output( @@ -392,7 +392,9 @@ def gene_dispatch_code(self, kernel_name, inplace_flag=False): }} """ - def gene_base_api_code(self, inplace_flag=False): + def gene_base_api_code( + self, inplace_flag=False, grad_flag=False, append_input_out=False + ): api_func_name = self.get_api_func_name() if inplace_flag and api_func_name[-1] != '_': api_func_name += '_' @@ -403,7 +405,7 @@ def gene_base_api_code(self, inplace_flag=False): ) return f""" -PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag)}) {{ +PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag, grad_flag=grad_flag, append_input_out=False)}) {{ {kernel_dispatch_code} PADDLE_THROW(common::errors::Unimplemented( "The kernel of ({self.api}) for input tensors is unimplemented, please check the type of input tensors.")); @@ -468,7 +470,9 @@ def api_namespace(): ) -def generate_api(api_yaml_path, header_file_path, source_file_path): +def generate_api( + api_yaml_path, header_file_path, source_file_path, grad_flag=False +): apis = [] for each_api_yaml in api_yaml_path: @@ -496,8 +500,16 @@ def generate_api(api_yaml_path, header_file_path, source_file_path): continue if sparse_api.is_dygraph_api: sparse_api.is_dygraph_api = False - header_file.write(sparse_api.gene_api_declaration()) - source_file.write(sparse_api.gene_api_code()) + header_file.write( + sparse_api.gene_api_declaration( + grad_flag=grad_flag, append_input_out=False + ) + ) + source_file.write( + sparse_api.gene_api_code( + grad_flag=grad_flag, append_input_out=False + ) + ) header_file.write(namespace[1]) source_file.write(namespace[1]) @@ -556,11 +568,14 @@ def 
main(): backward_api_yaml_path = options.backward_api_yaml_path backward_header_file_path = options.backward_api_header_path backward_source_file_path = options.backward_api_source_path - generate_api(api_yaml_path, header_file_path, source_file_path) + generate_api( + api_yaml_path, header_file_path, source_file_path, grad_flag=False + ) generate_api( backward_api_yaml_path, backward_header_file_path, backward_source_file_path, + grad_flag=True, ) diff --git a/paddle/phi/api/generator/sparse_bw_api_gen.py b/paddle/phi/api/generator/sparse_bw_api_gen.py index c5ebdd51f2e1e8..059504de8def02 100644 --- a/paddle/phi/api/generator/sparse_bw_api_gen.py +++ b/paddle/phi/api/generator/sparse_bw_api_gen.py @@ -35,14 +35,24 @@ def get_return_type(self, inplace_flag=False): def gene_return_code(self): return "return;" - def gene_api_declaration(self): - return SparseAPI.gene_api_declaration(self) + def gene_api_declaration(self, grad_flag=False, append_input_out=False): + return SparseAPI.gene_api_declaration( + self, grad_flag=grad_flag, append_input_out=False + ) - def get_declare_args(self, inplace_flag=False): - return BackwardAPI.get_declare_args(self) + def get_declare_args( + self, inplace_flag=False, grad_flag=False, append_input_out=False + ): + return BackwardAPI.get_declare_args( + self, grad_flag=grad_flag, append_input_out=False + ) - def get_define_args(self, inplace_flag=False): - return BackwardAPI.get_define_args(self) + def get_define_args( + self, inplace_flag=False, grad_flag=False, append_input_out=False + ): + return BackwardAPI.get_define_args( + self, grad_flag=grad_flag, append_input_out=False + ) def gene_output( self, @@ -157,7 +167,9 @@ def api_namespace(): ) -def generate_api(api_yaml_path, header_file_path, source_file_path): +def generate_api( + api_yaml_path, header_file_path, source_file_path, grad_flag=False +): with open(api_yaml_path, 'r') as f: apis = yaml.load(f, Loader=yaml.FullLoader) header_file = open(header_file_path, 'w') @@ -175,8 +187,16 @@ def generate_api(api_yaml_path, header_file_path, source_file_path): for api in apis: sparse_bw_api = SparseBackwardAPI(api) - header_file.write(sparse_bw_api.gene_api_declaration()) - source_file.write(sparse_bw_api.gene_api_code()) + header_file.write( + sparse_bw_api.gene_api_declaration( + grad_flag=grad_flag, append_input_out=False + ) + ) + source_file.write( + sparse_bw_api.gene_api_code( + grad_flag=grad_flag, append_input_out=False + ) + ) header_file.write(namespace[1]) source_file.write(namespace[1]) @@ -213,7 +233,9 @@ def main(): header_file_path = options.api_header_path source_file_path = options.api_source_path - generate_api(api_yaml_path, header_file_path, source_file_path) + generate_api( + api_yaml_path, header_file_path, source_file_path, grad_flag=True + ) if __name__ == '__main__': diff --git a/paddle/phi/api/generator/strings_api_gen.py b/paddle/phi/api/generator/strings_api_gen.py index c22b5a6e87b030..03097c50e5a550 100644 --- a/paddle/phi/api/generator/strings_api_gen.py +++ b/paddle/phi/api/generator/strings_api_gen.py @@ -31,7 +31,7 @@ def get_api_func_name(self): def gene_api_declaration(self): return f""" // {", ".join(self.outputs['names'])} -{super().gene_api_declaration()} +{super().gene_api_declaration(append_input_out=False)} """ def get_kernel_tensor_out_type(self, output_name): @@ -306,10 +306,12 @@ def gene_kernel_select(self) -> str: return kernel_select_code - def gene_base_api_code(self, inplace_flag=False): + def gene_base_api_code( + self, inplace_flag=False, 
grad_flag=False, append_input_out=False + ): api_func_name = self.get_api_func_name() return f""" -PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag)}) {{ +PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag, grad_flag=grad_flag, append_input_out=False)}) {{ {self.gene_kernel_select()} {self.gen_string_tensor_kernel_code(inplace_flag)} }} diff --git a/paddle/phi/api/generator/tensor_operants_gen.py b/paddle/phi/api/generator/tensor_operants_gen.py index ea1184bf0581a6..4b15b84d6f5768 100644 --- a/paddle/phi/api/generator/tensor_operants_gen.py +++ b/paddle/phi/api/generator/tensor_operants_gen.py @@ -479,11 +479,11 @@ def gene_operants_base(self): api_func_name = self.get_api_func_name() if api_func_name[-1] != '_': return f""" -{indent}virtual {self.get_return_type()} {api_func_name}({self.get_declare_args()}) = 0; +{indent}virtual {self.get_return_type()} {api_func_name}({self.get_declare_args(append_input_out=False)}) = 0; """ else: return f""" -{indent}virtual {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)}) = 0; +{indent}virtual {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True, append_input_out=False)}) = 0; """ def get_declare_args_without_first_tensor(self, inplace_flag=False): @@ -553,11 +553,11 @@ def gene_operants_declaration(self): api_func_name = self.get_api_func_name() if api_func_name[-1] != '_': return f""" -{indent}{self.get_return_type()} {api_func_name}({self.get_declare_args()}); +{indent}{self.get_return_type()} {api_func_name}({self.get_declare_args(append_input_out=False)}); """ else: return f""" -{indent}{self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)}); +{indent}{self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True, append_input_out=False)}); """ def gene_operants_implementation(self): @@ -567,13 +567,13 @@ def gene_operants_implementation(self): # func declaration if func_name[-1] != '_': return f""" -{self.get_return_type()} PhiTensorOperants::{func_name}({self.get_define_args()}) {{ +{self.get_return_type()} PhiTensorOperants::{func_name}({self.get_define_args(append_input_out=False)}) {{ {indent}return paddle::experimental::{func_name}({func_args_code}); }} """ else: return f""" -{self.get_return_type(inplace_flag=True)} PhiTensorOperants::{func_name}({self.get_define_args(inplace_flag=True)}) {{ +{self.get_return_type(inplace_flag=True)} PhiTensorOperants::{func_name}({self.get_define_args(inplace_flag=True,append_input_out=False)}) {{ {indent}return paddle::experimental::{func_name}({func_args_code}); }} @@ -640,14 +640,14 @@ def gene_operants_manager_implementation(self): return ( final_code + f""" -{self.get_return_type()} OperantsManager::{func_name}({self.get_define_args()}) {{{self.gene_operants_manager_code()}}} +{self.get_return_type()} OperantsManager::{func_name}({self.get_define_args(append_input_out=False)}) {{{self.gene_operants_manager_code()}}} """ ) else: return ( final_code + f""" -{self.get_return_type(inplace_flag=True)} OperantsManager::{func_name}({self.get_define_args(inplace_flag=True)}) {{ +{self.get_return_type(inplace_flag=True)} OperantsManager::{func_name}({self.get_define_args(inplace_flag=True,append_input_out=False)}) {{ {self.gene_operants_manager_code()} }} """ diff --git a/paddle/phi/api/lib/tensor_method.cc 
b/paddle/phi/api/lib/tensor_method.cc index 2f7d54eaa05e00..5ad401cbddb7b8 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -34,13 +34,20 @@ limitations under the License. */ #include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h" #include "paddle/phi/api/lib/data_transform.h" #endif +#include "paddle/utils/optional.h" COMMON_DECLARE_bool(use_stride_kernel); namespace paddle { namespace experimental { // declare cast api -Tensor cast(const Tensor &x, DataType out_dtype); -Tensor copy_to(const Tensor &x, const Place &place, bool blocking); +Tensor cast(const Tensor &x, +DataType out_dtype, +paddle::optional input_out = paddle::none); + +Tensor copy_to(const Tensor &x, +const Place &place, +bool blocking, + paddle::optional input_out = paddle::none); } // namespace experimental // TODO(chenweihang): Remove this namespace using-directives later diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc index 7b37de7fce0d0c..8b3ff923b1d444 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -40,8 +40,8 @@ PHI_DEFINE_EXPORTED_string( "This option is useful when doing multi process training and " "each process have only one device (XPU). If you want to use " "all visible devices, set this to empty string. NOTE: the " - "reason of doing this is that we want to use P2P communication" - "between XPU devices, use XPU_VISIBLE_DEVICES can only use" + "reason of doing this is that we want to use P2P communication " + "between XPU devices, use XPU_VISIBLE_DEVICES can only use " "share-memory only."); namespace phi { diff --git a/paddle/phi/core/platform/cpu_helper.cc b/paddle/phi/core/platform/cpu_helper.cc index 751c0a3bd0f934..269d8fd8b6d2b1 100644 --- a/paddle/phi/core/platform/cpu_helper.cc +++ b/paddle/phi/core/platform/cpu_helper.cc @@ -51,7 +51,7 @@ void SetNumThreads(int num_threads) { return; #else PADDLE_THROW(common::errors::Unimplemented( - "This library (except OPENBLAS, MKLML) is not supported yet, so the" + "This library (except OPENBLAS, MKLML) is not supported yet, so the " "number of threads cannot be set.")); #endif } diff --git a/paddle/phi/core/platform/profiler.cc b/paddle/phi/core/platform/profiler.cc index 993db57c6d90eb..a03f55a3dcf9e6 100644 --- a/paddle/phi/core/platform/profiler.cc +++ b/paddle/phi/core/platform/profiler.cc @@ -625,7 +625,7 @@ void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, common::errors::InvalidArgument( - "Can't enable profiling, since the input state is" + "Can't enable profiling, since the input state is " "ProfilerState::kDisabled")); SynchronizeAllDevice(); std::lock_guard l(profiler_mu); diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index e7b6980f3b70bf..3a7e6eb108f1b9 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -2421,16 +2421,17 @@ void FusedMultiTransformerInt8InferMeta( } void FusedTransposeSplitQuantInferMeta(const MetaTensor& x, + const MetaTensor& input_scales, const IntArray& tokens_per_expert, bool pow_2_scales, std::vector outs, std::vector scales) { PADDLE_ENFORCE_EQ( - x.dtype(), - DataType::BFLOAT16, - common::errors::InvalidArgument( - "The dtype of Input(x) must be BFLOAT16, but received %s", - x.dtype())); + x.dtype() == DataType::BFLOAT16 || x.dtype() == DataType::FLOAT8_E4M3FN, + true, + common::errors::InvalidArgument("The dtype of Input(x) must be BFLOAT16 " + "or FLOAT8_E4M3FN, but 
received %s", + x.dtype())); auto x_dims = x.dims(); diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h index a3e6342b09f0a1..c1f6a988bf59b1 100644 --- a/paddle/phi/infermeta/fusion.h +++ b/paddle/phi/infermeta/fusion.h @@ -669,6 +669,7 @@ void FusedMultiTransformerInt8InferMeta( MetaTensor* out); void FusedTransposeSplitQuantInferMeta(const MetaTensor& x, + const MetaTensor& input_scales, const IntArray& tokens_per_expert, bool pow_2_scales, std::vector outs, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index a30e9fd2f035e4..933edfaa0ea1a3 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -366,6 +366,90 @@ void ArgMinMaxInferMeta(const MetaTensor& x, } } +void MinMaxWithIndexInferMeta(const MetaTensor& x, + const Scalar& axis, + bool keepdims, + bool flatten, + MetaTensor* val_out, + MetaTensor* ind_out, + MetaConfig config) { + DataType val_dtype = x.dtype(); + + if (!config.is_runtime && axis.FromTensor()) { + std::vector vec; + if (flatten) { + if (keepdims) { // NOLINT + vec = std::vector(x.dims().size(), -1); + } else { + vec = {}; + } + } else { + if (keepdims) { + vec = std::vector(x.dims().size(), -1); + } else { + vec = std::vector(x.dims().size() - 1, -1); + } + } + val_out->set_dims(common::make_ddim(vec)); + val_out->set_dtype(val_dtype); + ind_out->set_dims(common::make_ddim(vec)); + ind_out->set_dtype(DataType::INT64); + return; + } + auto int_axis = axis.to(); + const auto& x_dims = x.dims(); + + auto x_rank = x.dims().size(); + if (x_rank > 0) { + PADDLE_ENFORCE_GE(int_axis, + -x_rank, + common::errors::InvalidArgument( + "'axis'(%d) must be greater than or equal to" + " -Rank(X)(%d).", + int_axis, + -x_rank)); + PADDLE_ENFORCE_LT( + int_axis, + x_rank, + common::errors::InvalidArgument( + "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", + int_axis, + x_rank)); + } else { + // 0-dim tensor + PADDLE_ENFORCE_EQ(int_axis == 0 || int_axis == -1, + true, + common::errors::InvalidArgument( + "'axis'(%d) must be 0 or -1 if input tensor is " + "0-dim.", + int_axis)); + } + + if (int_axis < 0) int_axis += x_rank; + + std::vector vec; + if (flatten) { + if (keepdims) { // NOLINT + vec = std::vector(x.dims().size(), 1); + } else { + vec = {}; + } + } else { + for (int64_t i = 0; i < int_axis; i++) + vec.emplace_back(x_dims[static_cast(i)]); + if (keepdims) { + vec.emplace_back(static_cast(1)); + } + for (int64_t i = int_axis + 1; i < x_rank; i++) + vec.emplace_back(x_dims[static_cast(i)]); + } + + val_out->set_dims(common::make_ddim(vec)); + val_out->set_dtype(val_dtype); + ind_out->set_dims(common::make_ddim(vec)); + ind_out->set_dtype(DataType::INT64); +} + void ArgsortInferMeta(const MetaTensor& input, int axis, bool descending, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 7334ee476c0ad9..ea6c95748c16c5 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -66,6 +66,14 @@ void ArgMinMaxInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void MinMaxWithIndexInferMeta(const MetaTensor& x, + const Scalar& axis, + bool keepdims, + bool flatten, + MetaTensor* val_out, + MetaTensor* ind_out, + MetaConfig config = MetaConfig()); + void ArgsortInferMeta(const MetaTensor& input, int axis, bool descending, diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h index 80d7028a3082f9..a4a858ff8eaf8b 100644 --- a/paddle/phi/kernels/autotune/auto_tune_base.h +++ 
b/paddle/phi/kernels/autotune/auto_tune_base.h @@ -63,7 +63,7 @@ class AutoTuneBase { } template - void Run(const Context& ctx, + void Run(const Context& dev_ctx, const AlgorithmType& algo, const size_t key, Args&&... args) { @@ -78,7 +78,7 @@ class AutoTuneBase { if (use_autotune) { // All available kernels have ran while picking the best kernel, // so there may be no need for another kernel run. - auto best_idx = PickBestKernel(ctx, args...); + auto best_idx = PickBestKernel(dev_ctx, args...); cache.Set(key, best_idx); } else { kernels_[0].Run(args...); @@ -100,14 +100,14 @@ class AutoTuneBase { } template - size_t PickBestKernel(const Context& ctx, Args&&... args) { + size_t PickBestKernel(const Context& dev_ctx, Args&&... args) { std::lock_guard lock(mutex_); size_t best_idx = 0; float min_time = std::numeric_limits::max(); // Time cost test established in default stream. for (size_t i = 0; i < kernels_.size(); ++i) { - auto time = RunAndMeasureKernel(ctx, i, args...); + auto time = RunAndMeasureKernel(dev_ctx, i, args...); if (time < min_time) { min_time = time; best_idx = i; @@ -118,15 +118,17 @@ class AutoTuneBase { } template - float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) { + float RunAndMeasureKernel(const Context& dev_ctx, + const int idx, + Args&&... args) { // Regard 1st run as warmup, judge the compare result by the time cost // of rest cycles. constexpr int repeats = 11; phi::GpuTimer timer; float time_cost = 0; - const auto& stream = ctx.stream(); + const auto& stream = dev_ctx.stream(); - ctx.Wait(); + dev_ctx.Wait(); for (int i = 0; i < repeats; ++i) { timer.Start(stream); kernels_[idx].Run(args...); @@ -158,7 +160,7 @@ class MatmulAutoTuner } template - void Run(const Context& ctx, const size_t key, Args... args) { + void Run(const Context& dev_ctx, const size_t key, Args... args) { this->is_init_ = true; this->CheckKernelSize(); auto& cache = AutoTuneCache::Instance().GetMatmul(); @@ -168,7 +170,7 @@ class MatmulAutoTuner } else { bool use_autotune = AutoTuneStatus::Instance().UseAutoTune(); if (use_autotune) { - auto best_idx = this->PickBestKernel(ctx, args...); + auto best_idx = this->PickBestKernel(dev_ctx, args...); cache.Set(key, best_idx); } else { this->kernels_[0].Run(args...); @@ -210,7 +212,7 @@ class GatherGemmScatterAutoTuner return instance.get(); } - void Run(const phi::GPUContext& ctx, + void Run(const phi::GPUContext& dev_ctx, const size_t key, T const alpha, T const beta, @@ -227,15 +229,15 @@ class GatherGemmScatterAutoTuner } else { // Set alpha to 0 and beta to 1 to avoid changing the value of d when // picking the best kernel - auto best_idx = - PickBestKernel(ctx, static_cast(0), static_cast(1), args...); + auto best_idx = PickBestKernel( + dev_ctx, static_cast(0), static_cast(1), args...); cache.Set(key, best_idx); this->kernels_[best_idx].Run(alpha, beta, args...); } } protected: - size_t PickBestKernel(const phi::GPUContext& ctx, + size_t PickBestKernel(const phi::GPUContext& dev_ctx, const T& alpha, const T& beta, Args&... args) { @@ -250,7 +252,7 @@ class GatherGemmScatterAutoTuner // Some kernels may require more shared memory than available, skip these // kernels. 
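
The auto_tune_base.h hunks are a mechanical `ctx` -> `dev_ctx` rename, but the timing protocol they pass through is worth making explicit: `RunAndMeasureKernel` runs each candidate 11 times, discards the first run as warmup, and `PickBestKernel` keeps the index with the smallest accumulated time. A standalone sketch under that reading; the `Stop`/`ElapsedTime` calls on `phi::GpuTimer` and the gpu_timer.h include path are assumptions from context, as the excerpt only shows `Start`:

#include <functional>

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/autotune/gpu_timer.h"

// Sketch of the warmup-then-accumulate measurement loop, for illustration.
float RunAndMeasure(const phi::GPUContext& dev_ctx,
                    const std::function<void()>& kernel) {
  constexpr int repeats = 11;
  phi::GpuTimer timer;
  float time_cost = 0.0f;
  dev_ctx.Wait();  // drain pending work so timing starts on a quiet stream
  for (int i = 0; i < repeats; ++i) {
    timer.Start(dev_ctx.stream());
    kernel();
    timer.Stop(dev_ctx.stream());
    if (i > 0) time_cost += timer.ElapsedTime();  // first run is warmup
  }
  return time_cost;
}
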
try { - time = this->RunAndMeasureKernel(ctx, i, alpha, beta, args...); + time = this->RunAndMeasureKernel(dev_ctx, i, alpha, beta, args...); if (time < min_time) { min_time = time; best_idx = i; diff --git a/paddle/phi/kernels/cpu/min_max_with_index_kernel.cc b/paddle/phi/kernels/cpu/min_max_with_index_kernel.cc new file mode 100644 index 00000000000000..f373553389e422 --- /dev/null +++ b/paddle/phi/kernels/cpu/min_max_with_index_kernel.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/min_max_with_index_kernel.h" + +#include "paddle/common/ddim.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#define DEFINE_WITH_INDEX_KERNEL(OpType, name) \ + template \ + void OpType##WithIndexKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const Scalar& dim, \ + bool keepdim, \ + bool flatten, \ + DenseTensor* val_out, \ + DenseTensor* ind_out) { \ + PADDLE_ENFORCE_EQ(0, \ + 1, \ + phi::errors::Unimplemented( \ + "In static graph mode, %s PHI kernel is not " \ + "currently available on non-GPU devices.", \ + #name)); \ + } \ + template \ + void OpType##WithIndexGradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& values, \ + const DenseTensor& indices, \ + const DenseTensor& values_grad, \ + const Scalar& dim, \ + bool keepdim, \ + DenseTensor* x_grad) { \ + PADDLE_ENFORCE_EQ(0, \ + 1, \ + phi::errors::Unimplemented( \ + "In static graph mode, %s PHI kernel is not " \ + "currently available on non-GPU devices.", \ + #name)); \ + } + +namespace phi { + +DEFINE_WITH_INDEX_KERNEL(Min, min_with_index) +DEFINE_WITH_INDEX_KERNEL(Max, max_with_index) +#undef DEFINE_WITH_INDEX_KERNEL + +} // namespace phi + +#define REGISTER_CPU_KERNELS(OpType, OpName) \ + PD_REGISTER_KERNEL(OpName, \ + CPU, \ + ALL_LAYOUT, \ + phi::OpType##WithIndexKernel, \ + phi::dtype::float16, \ + phi::dtype::bfloat16, \ + float, \ + double, \ + int32_t, \ + int64_t, \ + int16_t, \ + uint8_t) { \ + kernel->OutputAt(0).SetDataType(kernel->InputAt(0).dtype); \ + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); \ + } \ + PD_REGISTER_KERNEL(OpName##_grad, \ + CPU, \ + ALL_LAYOUT, \ + phi::OpType##WithIndexGradKernel, \ + float, \ + double, \ + uint8_t, \ + int, \ + int16_t, \ + int64_t, \ + phi::dtype::float16, \ + phi::dtype::bfloat16) {} + +REGISTER_CPU_KERNELS(Min, min_with_index) +REGISTER_CPU_KERNELS(Max, max_with_index) +#undef REGISTER_CPU_KERNELS diff --git a/paddle/phi/kernels/funcs/cross_entropy.cc b/paddle/phi/kernels/funcs/cross_entropy.cc index 6616f07e68a10c..9fb68c155402f5 100644 --- a/paddle/phi/kernels/funcs/cross_entropy.cc +++ b/paddle/phi/kernels/funcs/cross_entropy.cc @@ -93,7 +93,7 @@ struct HardLabelCrossEntropyCPUFunctorImpl { template void 
CrossEntropyFunctor::operator()( - const DeviceContext& ctx, + const DeviceContext& dev_ctx, phi::DenseTensor* out, const phi::DenseTensor* prob, const phi::DenseTensor* labels, @@ -110,7 +110,7 @@ void CrossEntropyFunctor::operator()( auto lbl = EigenMatrix::From(*labels); auto loss = EigenMatrix::From(*out); - loss.device(*ctx.eigen_device()) = + loss.device(*dev_ctx.eigen_device()) = -((lbl * in.log().unaryExpr(phi::funcs::TolerableValue())) .reshape(batch_axis_remain) .sum(Eigen::DSizes(1))); diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index f351e74260c022..b532b1a90163ca 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -1378,7 +1378,7 @@ struct CopySignGradXYFunctor { if (x == static_cast(0)) outs[0] = static_cast(0); else - outs[0] = static_cast(dout * (funcs::copysign_func(x, y)) / x); + outs[0] = static_cast(dout * (funcs::copysign_func(x, y) / x)); // dy = 0 outs[1] = static_cast(0); return outs; diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.cc b/paddle/phi/kernels/funcs/gather_scatter_functor.cc index 95c9f69a2abfd8..f7274faebd6f08 100644 --- a/paddle/phi/kernels/funcs/gather_scatter_functor.cc +++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cc @@ -77,7 +77,7 @@ struct cpu_gather_scatter_functor { const std::string& method_name, const func_t& reduce_op, bool include_self, - const phi::DeviceContext& ctx UNUSED) { + const phi::DeviceContext& dev_ctx UNUSED) { if (index.numel() == 0) { return; } @@ -237,7 +237,7 @@ void cpu_gather_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor result, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()(result, @@ -247,7 +247,7 @@ void cpu_gather_kernel(phi::DenseTensor self, "gather_out_cpu", tensor_assign, include_self, - ctx); + dev_ctx); } template @@ -256,7 +256,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor src, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()(self, @@ -266,7 +266,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self, "scatter_assign_cpu", tensor_assign, include_self, - ctx); + dev_ctx); } template @@ -275,11 +275,17 @@ void cpu_scatter_add_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor src, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_add_cpu", reduce_add, include_self, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_add_cpu", + reduce_add, + include_self, + dev_ctx); } template @@ -288,11 +294,17 @@ void cpu_scatter_mul_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor src, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_mul_cpu", reduce_mul, include_self, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_mul_cpu", + reduce_mul, + include_self, + dev_ctx); } template @@ -301,11 +313,17 @@ void cpu_scatter_mean_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor src, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()( - 
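
The CopySignGradXYFunctor change above only regroups parentheses, but the regrouping is numerically meaningful: `copysign_func(x, y) / x` divides two values of equal magnitude, so it evaluates to exactly plus or minus one, and `dout * (+-1)` can never overflow, whereas `(dout * copysign_func(x, y)) / x` forms an intermediate product that can overflow in narrow dtypes. The same effect reproduced in plain float:

#include <cmath>
#include <cstdio>

int main() {
  const float dout = 3.0e38f, x = 8.0f, y = -3.0f;
  // Intermediate dout * copysign(x, y) = -2.4e39 overflows float to -inf:
  const float overflowing = (dout * std::copysign(x, y)) / x;
  // copysign(x, y) / x is exactly -1.0f, so this stays finite:
  const float safe = dout * (std::copysign(x, y) / x);
  std::printf("%g vs %g\n", overflowing, safe);  // -inf vs -3e+38
  return 0;
}

The x == 0 case is still handled separately in the functor (the gradient is forced to zero before this expression runs).
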
self, dim, index, src, "scatter_mean_cpu", reduce_add, include_self, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_mean_cpu", + reduce_add, + include_self, + dev_ctx); } template @@ -314,11 +332,17 @@ void cpu_scatter_max_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor src, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_max_cpu", reduce_max, include_self, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_max_cpu", + reduce_max, + include_self, + dev_ctx); } template @@ -327,11 +351,17 @@ void cpu_scatter_min_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor src, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_min_cpu", reduce_min, include_self, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_min_cpu", + reduce_min, + include_self, + dev_ctx); } template @@ -340,7 +370,7 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED, const phi::DenseTensor& index, phi::DenseTensor grad, bool include_self UNUSED, - const phi::DeviceContext& ctx UNUSED) { + const phi::DeviceContext& dev_ctx UNUSED) { auto* index_data = index.data(); auto* grad_data = grad.data(); @@ -376,16 +406,17 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED, } template -void cpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self UNUSED, - int dim, - const phi::DenseTensor& index, - const phi::DenseTensor& out, - const phi::DenseTensor& x, - const phi::DenseTensor& value, - phi::DenseTensor grad, - const std::string& reduce, - bool include_self UNUSED, - const phi::DeviceContext& ctx) { +void cpu_scatter_mul_min_max_input_grad_kernel( + phi::DenseTensor self UNUSED, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self UNUSED, + const phi::DeviceContext& dev_ctx) { auto* index_data = index.data(); auto* grad_data = grad.data(); auto* out_data = out.data(); @@ -457,7 +488,8 @@ void cpu_scatter_mean_input_grad_kernel(phi::DenseTensor self UNUSED, const phi::DenseTensor& index, phi::DenseTensor grad, bool include_self UNUSED, - const phi::DeviceContext& ctx UNUSED) { + const phi::DeviceContext& dev_ctx + UNUSED) { auto* index_data = index.data(); auto* grad_data = grad.data(); @@ -504,7 +536,7 @@ void cpu_scatter_value_grad_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor grad, bool include_self UNUSED, - const phi::DeviceContext& ctx UNUSED) { + const phi::DeviceContext& dev_ctx UNUSED) { auto* self_data = self.data(); auto* index_data = index.data(); auto* grad_data = grad.data(); @@ -564,7 +596,7 @@ void cpu_scatter_add_mean_value_grad_kernel( phi::DenseTensor grad, const std::string& reduce, bool include_self, - const phi::DeviceContext& ctx UNUSED) { + const phi::DeviceContext& dev_ctx UNUSED) { auto* self_data = self.data(); auto* index_data = index.data(); auto* grad_data = grad.data(); @@ -643,16 +675,17 @@ void cpu_scatter_add_mean_value_grad_kernel( } template -void cpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self, - int dim, - const phi::DenseTensor& index, - const phi::DenseTensor& out, - const phi::DenseTensor& x, - const 
phi::DenseTensor& value, - phi::DenseTensor grad, - const std::string& reduce, - bool include_self, - const phi::DeviceContext& ctx) { +void cpu_scatter_mul_min_max_value_grad_kernel( + phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& dev_ctx) { auto* self_data = self.data(); auto* index_data = index.data(); auto* grad_data = grad.data(); diff --git a/paddle/phi/kernels/funcs/math/cos_sim_functor.cc b/paddle/phi/kernels/funcs/math/cos_sim_functor.cc index 60f1b388de3ad0..cba26a884e72c2 100644 --- a/paddle/phi/kernels/funcs/math/cos_sim_functor.cc +++ b/paddle/phi/kernels/funcs/math/cos_sim_functor.cc @@ -18,7 +18,7 @@ namespace phi { namespace math { template struct CosSimDyFunctor { - void operator()(const phi::CPUContext& ctx, + void operator()(const phi::CPUContext& dev_ctx, const T* x_norm, const T* y_norm, const T* x, diff --git a/paddle/phi/kernels/funcs/math/cos_sim_functor.cu b/paddle/phi/kernels/funcs/math/cos_sim_functor.cu index 762178b4a9d613..f37fd91ee87efd 100644 --- a/paddle/phi/kernels/funcs/math/cos_sim_functor.cu +++ b/paddle/phi/kernels/funcs/math/cos_sim_functor.cu @@ -50,7 +50,7 @@ __global__ void CosSimDyKernel(const T* x_norm, template struct CosSimDyFunctor { - void operator()(const phi::GPUContext& ctx, + void operator()(const phi::GPUContext& dev_ctx, const T* x_norm, const T* y_norm, const T* x, @@ -63,7 +63,7 @@ struct CosSimDyFunctor { const int block_size = 512; dim3 threads(block_size, 1); dim3 grid((rows + block_size - 1) / block_size, 1); - CosSimDyKernel<<>>( + CosSimDyKernel<<>>( x_norm, y_norm, x, y, z, dz, rows, cols, dy); } }; diff --git a/paddle/phi/kernels/funcs/math/cos_sim_functor.h b/paddle/phi/kernels/funcs/math/cos_sim_functor.h index e01af90df4d4e1..ed2e71f8af8d8f 100644 --- a/paddle/phi/kernels/funcs/math/cos_sim_functor.h +++ b/paddle/phi/kernels/funcs/math/cos_sim_functor.h @@ -174,7 +174,7 @@ struct CosSimDxFunctor { template struct CosSimDyFunctor { - void operator()(const DeviceContext& ctx, + void operator()(const DeviceContext& dev_ctx, const T* x_norm, const T* y_norm, const T* x, diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h index 32afd7fdaa1b5d..a1da63a3ab9628 100644 --- a/paddle/phi/kernels/funcs/scatter.cu.h +++ b/paddle/phi/kernels/funcs/scatter.cu.h @@ -404,7 +404,7 @@ __global__ void scatter_gather_elementwise_kernel(int N, func_t f) { } template -void GPUScatterAdd(const phi::GPUContext& ctx, +void GPUScatterAdd(const phi::GPUContext& dev_ctx, const DenseTensor& src, const DenseTensor& index, DenseTensor* output, @@ -483,7 +483,7 @@ void GPUScatterAdd(const phi::GPUContext& ctx, constexpr int vt = 8; const dim3 block(nt); const dim3 grid((N + block.x * vt - 1) / (block.x * vt)); - auto stream = ctx.stream(); + auto stream = dev_ctx.stream(); scatter_gather_elementwise_kernel <<>>(N, reduce_add); diff --git a/paddle/phi/kernels/funcs/unique_functor.h b/paddle/phi/kernels/funcs/unique_functor.h index 758b9160096d09..fafb1b284c60a8 100644 --- a/paddle/phi/kernels/funcs/unique_functor.h +++ b/paddle/phi/kernels/funcs/unique_functor.h @@ -23,6 +23,35 @@ namespace phi { namespace funcs { +template +static bool NaNSafeEqual(const T& a, const T& b) { + if constexpr (std::is_floating_point_v) { + if (std::isnan(a) && std::isnan(b)) { + return &a == &b; + } + if (std::isnan(a) || 
std::isnan(b)) { + return false; + } + } + return a == b; +} + +template +static bool NaNSafeLess(const T& a, const T& b) { + if constexpr (std::is_floating_point_v) { + if (std::isnan(a) && !std::isnan(b)) { + return false; + } + if (!std::isnan(a) && std::isnan(b)) { + return true; + } + if (std::isnan(a) && std::isnan(b)) { + return &a < &b; + } + } + return a < b; +} + template struct UniqueOpFunctor { const Context& dev_ctx_; @@ -122,7 +151,7 @@ static bool Equal(const DenseTensor& a, const DenseTensor& b) { return false; } for (int64_t i = 0; i < a.numel(); ++i) { - if (a.data()[i] != b.data()[i]) { + if (!NaNSafeEqual(a.data()[i], b.data()[i])) { return false; } } @@ -140,7 +169,15 @@ static void UniqueFlattenedTensor(const Context& dev_ctx, bool return_inverse, bool return_counts) { const InT* in_data = in.data(); - std::set unique(in_data, in_data + in.numel()); + + auto nan_safe_comp = [](const InT& a, const InT& b) { + return NaNSafeLess(a, b); + }; + std::set unique(nan_safe_comp); + for (int64_t i = 0; i < in.numel(); ++i) { + unique.insert(in_data[i]); + } + out->Resize(common::make_ddim({static_cast(unique.size())})); auto* out_data = dev_ctx.template Alloc(out); std::copy(unique.begin(), unique.end(), out_data); @@ -162,29 +199,27 @@ static void UniqueFlattenedTensor(const Context& dev_ctx, if (return_inverse) { index->Resize(common::make_ddim({in.numel()})); auto inverse_data = dev_ctx.template Alloc(index); - std::unordered_map inverse_map; - inverse_map.reserve(out->numel()); - for (int64_t i = 0; i < out->numel(); ++i) { - inverse_map[out_data[i]] = i; - } for (int64_t i = 0; i < in.numel(); ++i) { - inverse_data[i] = inverse_map[in_data[i]]; + for (int64_t j = 0; j < out->numel(); ++j) { + if (NaNSafeEqual(in_data[i], out_data[j])) { + inverse_data[i] = j; + break; + } + } } } if (return_counts) { count->Resize(common::make_ddim({out->numel()})); auto count_data = dev_ctx.template Alloc(count); - std::unordered_map counts_map; - counts_map.reserve(out->numel()); for (int64_t i = 0; i < out->numel(); ++i) { - counts_map[out_data[i]] = 0; - } - for (int64_t i = 0; i < in.numel(); i++) { - counts_map[in_data[i]] += 1; - } - for (int64_t i = 0; i < out->numel(); i++) { - count_data[i] = counts_map[out_data[i]]; + IndexT cnt = 0; + for (int64_t j = 0; j < in.numel(); ++j) { + if (NaNSafeEqual(out_data[i], in_data[j])) { + cnt++; + } + } + count_data[i] = cnt; } } } diff --git a/paddle/phi/kernels/fusion/gpu/fused_transpose_split_quant_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_transpose_split_quant_kernel.cu index 23ddde393f3dd2..16503aa32f263d 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_transpose_split_quant_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_transpose_split_quant_kernel.cu @@ -29,43 +29,62 @@ struct __align__(sizeof(T) * VecSize) VecType { } }; -template -__device__ void BlockLoad(const phi::bfloat16* input, +template +__device__ void BlockLoad(const InT* input, + const float* input_scales, __nv_bfloat16 x[8][4], - size_t K) { + size_t K, + size_t k_scaled) { + constexpr bool need_dequant = std::is_same_v; + +#pragma unroll for (uint32_t i = 0; i < 8; i++) { - size_t off_m = blockIdx.x * size_t(128) + threadIdx.y + i * 16; - size_t off_k = blockIdx.y * 128 + threadIdx.x * VecSize; - size_t offset = off_m * K + off_k; + const uint32_t local_off_M = threadIdx.y + i * 16; + const uint32_t off_m = blockIdx.x * 128 + local_off_M; + const uint32_t off_k = blockIdx.y * 128 + threadIdx.x * VecSize; + const size_t offset = off_m * K + off_k; + + float 
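
The NaNSafeEqual/NaNSafeLess helpers above exist because IEEE-754 NaN compares false against everything, itself included: with the default `operator<`, `std::set` sees `!(a < b) && !(b < a)` for NaN against any number, treats them as equivalent, and silently drops elements (a strict-weak-ordering violation). The helpers order NaNs after all numbers and break NaN-vs-NaN ties by object address, so every distinct NaN survives deduplication. A compact demonstration of the failure mode and the comparator shape:

#include <cmath>
#include <cstdio>
#include <set>

int main() {
  const float nan_v = std::nanf("");
  // NaN is "equivalent" to 1.0f under the default comparator and is dropped:
  std::set<float> naive{1.0f, nan_v, 2.0f};
  std::printf("naive size: %zu\n", naive.size());  // typically 2, not 3

  // NaN-aware ordering in the spirit of NaNSafeLess:
  auto nan_less = [](const float& a, const float& b) {
    if (std::isnan(a) && !std::isnan(b)) return false;  // NaNs sort last
    if (!std::isnan(a) && std::isnan(b)) return true;
    if (std::isnan(a) && std::isnan(b)) return &a < &b;  // identity tie-break
    return a < b;
  };
  std::set<float, decltype(nan_less)> safe({1.0f, nan_v, 2.0f}, nan_less);
  std::printf("safe size: %zu\n", safe.size());  // 3
  return 0;
}

The address tie-break means equality by identity: two NaNs compare equal only if they are the same object, matching the "every NaN is unique" semantics the rewritten UniqueFlattenedTensor relies on, at the cost of the O(n * m) inverse/count scans that replace the old hash maps (NaN cannot key an unordered_map usefully either).
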
scale; + if constexpr (need_dequant) { + const uint32_t m_base = blockIdx.x * 128; + const uint32_t m_stride = k_scaled; + scale = input_scales[off_m * m_stride + blockIdx.y]; + } +#pragma unroll for (uint32_t j = 0; j < 4; j += VecSize) { - if (off_k + j * 32 < K) { - size_t idx = offset + j * 32; - using LoadT = VecType<__nv_bfloat16, VecSize>; - LoadT data = *reinterpret_cast(input + idx); - for (uint32_t k = 0; k < VecSize; k++) { - x[i][j + k] = data[k]; + const size_t idx = offset + j * 32; + using LoadT = VecType; + LoadT data = *reinterpret_cast(input + idx); +#pragma unroll + for (uint32_t k = 0; k < VecSize; k++) { + if constexpr (need_dequant) { + x[i][j + k] = __float2bfloat16(static_cast(data[k]) * scale); + } else { + x[i][j + k] = (*reinterpret_cast<__nv_bfloat16*>(&data[k])); } } } } } - template __device__ void BlockColumnScale(const __nv_bfloat16 x[8][4], - float col_scale[128], + float scales[128], __nv_bfloat16* shm) { // reduce [(8), 16, 32, 4] => [16, 32, 4] __nv_bfloat16 warp_max[4]; +#pragma unroll for (uint32_t i = 0; i < 8; i++) { +#pragma unroll for (uint32_t j = 0; j < 4; j++) { - __nv_bfloat16 t = BF16_ABS(x[i][j]); + const __nv_bfloat16 t = BF16_ABS(x[i][j]); warp_max[j] = i == 0 ? t : BF16_MAX(warp_max[j], t); } } // reduce [(16), 32, 4] => [8, 32, 4] if (threadIdx.y >= 8) { +#pragma unroll for (uint32_t j = 0; j < 4; j++) { shm[(threadIdx.y - 8) * 128 + threadIdx.x + j * 32] = warp_max[j]; } @@ -75,8 +94,9 @@ __device__ void BlockColumnScale(const __nv_bfloat16 x[8][4], // reduce [(8), 32, 4] => [32, 4] for (uint32_t offset = 8; offset > 0; offset /= 2) { if (threadIdx.y < offset) { +#pragma unroll for (uint32_t j = 0; j < 4; j++) { - __nv_bfloat16 other = + const __nv_bfloat16 other = offset == 8 ? warp_max[j] : shm[(threadIdx.y + offset) * 128 + threadIdx.x + j * 32]; @@ -85,7 +105,7 @@ __device__ void BlockColumnScale(const __nv_bfloat16 x[8][4], if (offset > 1) { shm[threadIdx.y * 128 + threadIdx.x + j * 32] = next_val; } else { - col_scale[threadIdx.x + j * 32] = + scales[threadIdx.x + j * 32] = ComputeScale<__nv_bfloat16, __nv_fp8_e4m3, Pow2Scales>( static_cast(next_val), 0.0f); } @@ -98,7 +118,7 @@ __device__ void BlockColumnScale(const __nv_bfloat16 x[8][4], template __device__ void BlockStoreScale(float* scale, size_t off_m, - float col_scale[128], + float scales[128], size_t K) { if (threadIdx.y < 4) { uint32_t off = threadIdx.y * 32 + threadIdx.x; @@ -107,10 +127,10 @@ __device__ void BlockStoreScale(float* scale, } else if constexpr (VecSize == 2) { off = (off / 64) * 64 + (off % 2) * 32 + (off % 64) / 2; } - float scale_out = 1.0f / col_scale[off]; - size_t idx_y = blockIdx.x - off_m / 128; - size_t idx_x = blockIdx.y * 128 + threadIdx.y * 32 + threadIdx.x; - size_t idx = idx_y * K + idx_x; + float scale_out = 1.0f / scales[off]; + const size_t idx_y = blockIdx.x - off_m / 128; + const size_t idx_x = blockIdx.y * 128 + threadIdx.y * 32 + threadIdx.x; + const size_t idx = idx_y * K + idx_x; if (idx_x < K) { scale[idx] = scale_out; } @@ -123,14 +143,16 @@ __device__ void BlockStoreOut(OutT* out, size_t cur_tokens, const OutT shm[128][129], size_t K) { +#pragma unroll for (uint32_t i = 0; i < 8; i++) { - size_t idx_m = blockIdx.x * size_t(128) + threadIdx.x * 4; - size_t idx_k = blockIdx.y * 128 + threadIdx.y + i * 16; - size_t idx = idx_k * cur_tokens + (idx_m - off_m); + const size_t idx_m = blockIdx.x * size_t(128) + threadIdx.x * 4; + const size_t idx_k = blockIdx.y * 128 + threadIdx.y + i * 16; + const size_t idx = idx_k * cur_tokens + (idx_m 
- off_m); if (idx_k < K) { using StoreT = VecType; StoreT data; +#pragma unroll for (uint32_t j = 0; j < VecSize; j++) { data[j] = shm[i * 16 + threadIdx.y][threadIdx.x * 4 + j]; } @@ -139,23 +161,27 @@ __device__ void BlockStoreOut(OutT* out, } } -template +template __global__ void __launch_bounds__(512) - FusedTransposeSplitQuantKernel(const phi::bfloat16* __restrict__ input, + FusedTransposeSplitQuantKernel(const InT* __restrict__ input, + const float* __restrict__ input_scales, int64_t* __restrict__ meta, size_t num_experts, - size_t K) { + size_t K, + size_t k_scaled) { __shared__ OutT shm[128][129]; + __shared__ size_t expert_info[2]; + __shared__ float scales[128]; // Could this buffer be reused? Is it worthwhile? + int64_t* tokens_per_expert = meta; OutT** out_ptrs = reinterpret_cast(meta + num_experts); float** scale_ptrs = reinterpret_cast(meta + num_experts * 2); // 1. Load 128x128 elements from input __nv_bfloat16 x[8][4]; - BlockLoad(input, x, K); + BlockLoad(input, input_scales, x, K, k_scaled); // 2. Get expert index and offset of the current block - __shared__ size_t expert_info[2]; if (threadIdx.x == 0 && threadIdx.y == 0) { size_t idx_m = blockIdx.x * size_t(128); size_t off_m = 0, next_off_m = 0; @@ -172,21 +198,23 @@ __global__ void __launch_bounds__(512) } // 3. Calculate scale along the column - __shared__ float col_scale[128]; BlockColumnScale( - x, col_scale, reinterpret_cast<__nv_bfloat16*>(shm)); + x, scales, reinterpret_cast<__nv_bfloat16*>(shm)); // 4. Store scale const size_t expert_idx = expert_info[0]; const size_t off_m = expert_info[1]; - BlockStoreScale(scale_ptrs[expert_idx], off_m, col_scale, K); + BlockStoreScale(scale_ptrs[expert_idx], off_m, scales, K); - // 5. Scale x and save into shared memory with transposed layout +// 5. Scale x and save into shared memory with transposed layout +#pragma unroll for (uint32_t i = 0; i < 8; i++) { +#pragma unroll for (uint32_t j = 0; j < 4; j += VecSize) { +#pragma unroll for (uint32_t k = 0; k < VecSize; k++) { float x_fp32 = static_cast(x[i][j + k]); - float x_scaled = x_fp32 * col_scale[threadIdx.x + (j + k) * 32]; + float x_scaled = x_fp32 * scales[threadIdx.x + (j + k) * 32]; shm[threadIdx.x * VecSize + j * 32 + k][i * 16 + threadIdx.y] = static_cast(x_scaled); } @@ -204,10 +232,11 @@ template void FusedTransposeSplitQuantKernel( const Context& dev_ctx, const DenseTensor& x, + const paddle::optional& input_scales, const std::vector& tokens_per_expert, bool pow_2_scales, std::vector outs, - std::vector scales) { + std::vector output_scales) { auto x_dims = x.dims(); const int64_t M = x_dims[0]; const int64_t K = x_dims[1]; @@ -221,8 +250,8 @@ void FusedTransposeSplitQuantKernel( if (outs[i] != nullptr) { dev_ctx.template Alloc(outs[i]); } - if (scales[i] != nullptr) { - dev_ctx.template Alloc(scales[i]); + if (output_scales[i] != nullptr) { + dev_ctx.template Alloc(output_scales[i]); } } @@ -245,8 +274,8 @@ void FusedTransposeSplitQuantKernel( for (size_t i = 0; i < num_experts; i++) { meta_ptr[num_experts * 2 + i] = - scales[i] != nullptr - ? reinterpret_cast(scales[i]->data()) + output_scales[i] != nullptr + ?
reinterpret_cast(output_scales[i]->data()) : 0; } @@ -255,23 +284,35 @@ void FusedTransposeSplitQuantKernel( auto stream = dev_ctx.stream(); - dim3 grid(M / 128, (K + 127) / 128); + // pre-compute on CPU to reduce size_t division cost in kernel + const size_t k_scaled = (K + 127) / 128; + dim3 grid(M / 128, k_scaled); dim3 block(32, 16); -#define LAUNCH_KERNEL(POW_2_SCALES, VEC_SIZE) \ - FusedTransposeSplitQuantKernel \ - <<>>(x.data(), \ - meta_gpu.data(), \ - num_experts, \ - K); +#define DTYPE_CASE(dtype, type) dtype == phi::DataType::type +#define LAUNCH_KERNEL(T, POW_2_SCALES, VEC_SIZE) \ + FusedTransposeSplitQuantKernel<<>>( \ + x.data(), \ + input_scales ? input_scales.get_ptr()->data() : nullptr, \ + meta_gpu.data(), \ + num_experts, \ + K, \ + k_scaled); +#define DISPATCH_DATATYPE(POW_2_SCALES, VEC_SIZE) \ + if (DTYPE_CASE(x.dtype(), BFLOAT16)) { \ + LAUNCH_KERNEL(phi::bfloat16, POW_2_SCALES, VEC_SIZE); \ + } else if (DTYPE_CASE(x.dtype(), FLOAT8_E4M3FN)) { \ + LAUNCH_KERNEL(phi::float8_e4m3fn, POW_2_SCALES, VEC_SIZE); \ + } #define LAUNCH_KERNEL_PARTIAL(VEC_SIZE) \ if (pow_2_scales) { \ - LAUNCH_KERNEL(true, VEC_SIZE); \ + DISPATCH_DATATYPE(true, VEC_SIZE); \ } else { \ - LAUNCH_KERNEL(false, VEC_SIZE); \ + DISPATCH_DATATYPE(false, VEC_SIZE); \ } if (K % 4 == 0) { @@ -296,7 +337,8 @@ PD_REGISTER_KERNEL(fused_transpose_split_quant, double, int, int64_t, - phi::dtype::bfloat16) { + phi::dtype::bfloat16, + phi::dtype::float8_e4m3fn) { kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT8_E4M3FN); kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); } diff --git a/paddle/phi/kernels/gpu/cuda_gemm_kernel.h b/paddle/phi/kernels/gpu/cuda_gemm_kernel.h index f13831bc25034b..0efe77d7817dc0 100644 --- a/paddle/phi/kernels/gpu/cuda_gemm_kernel.h +++ b/paddle/phi/kernels/gpu/cuda_gemm_kernel.h @@ -26,7 +26,7 @@ typedef struct { } GemmParams; template -void CudaGemm(const Context& ctx, +void CudaGemm(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& w, DenseTensor* output); diff --git a/paddle/phi/kernels/gpu/min_max_with_index_grad_kernel.cu b/paddle/phi/kernels/gpu/min_max_with_index_grad_kernel.cu new file mode 100644 index 00000000000000..f34d03bf07e506 --- /dev/null +++ b/paddle/phi/kernels/gpu/min_max_with_index_grad_kernel.cu @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/gather_scatter_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +using EnableIfInteger = + typename std::enable_if::value, int>::type; + +template +using EnableIfNonInteger = + typename std::enable_if::value, int>::type; + +// Here if keepdim=True, this will fallback to a simplified version of +// take_along_axis. 
However, if keepdim=False (by default), indices will +// not have equal rank with the input values (and values_grad), therefore +// an unsqueeze is needed, done by shallow-copying indices and calling Resize +#define DEFINE_WITH_INDEX_GRAD_KERNEL(OpType) \ + template = 0> \ + void OpType##WithIndexGradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& values, \ + const DenseTensor& indices, \ + const DenseTensor& values_grad, \ + const Scalar& dim, \ + bool keepdim, \ + DenseTensor* x_grad) { \ + x_grad->Resize(x.dims()); \ + dev_ctx.template Alloc(x_grad); \ + if (x_grad->numel() == 0) { \ + return; \ + } \ + int64_t dim_val = dim.to(); \ + if (dim_val < 0) { \ + dim_val += x.dims().size(); \ + } \ + DenseTensor shallow_copied_inds(indices); \ + if (!keepdim) { \ + auto indices_dim = x.dims(); \ + indices_dim[dim_val] = 1; \ + shallow_copied_inds.Resize(indices_dim); \ + } \ + phi::funcs::SetConstant functor; \ + functor(dev_ctx, x_grad, static_cast(0)); \ + phi::funcs::gpu_scatter_add_kernel( \ + *x_grad, dim_val, shallow_copied_inds, values_grad, true, dev_ctx); \ + } \ + template = 0> \ + void OpType##WithIndexGradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& values, \ + const DenseTensor& indices, \ + const DenseTensor& values_grad, \ + const Scalar& dim, \ + bool keepdim, \ + DenseTensor* x_grad) { \ + std::string dtype_name = phi::DataTypeToString(values.dtype()); \ + PADDLE_ENFORCE_EQ( \ + 0, \ + 1, \ + phi::errors::InvalidArgument( \ + "Integer type '%s' is not allowed to have stop_gradient=False.", \ + dtype_name.c_str())); \ + } + +DEFINE_WITH_INDEX_GRAD_KERNEL(Max) +DEFINE_WITH_INDEX_GRAD_KERNEL(Min) + +#undef DEFINE_WITH_INDEX_GRAD_KERNEL + +} // namespace phi + +PD_REGISTER_KERNEL(max_with_index_grad, + GPU, + ALL_LAYOUT, + phi::MaxWithIndexGradKernel, + float, + double, + uint8_t, + int, + int16_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(min_with_index_grad, + GPU, + ALL_LAYOUT, + phi::MinWithIndexGradKernel, + float, + double, + uint8_t, + int, + int16_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/min_max_with_index_kernel.cu b/paddle/phi/kernels/gpu/min_max_with_index_kernel.cu new file mode 100644 index 00000000000000..2509c34fb0c8fd --- /dev/null +++ b/paddle/phi/kernels/gpu/min_max_with_index_kernel.cu @@ -0,0 +1,312 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
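The backward macro above hinges on one shape detail: with keepdim=False, indices has one fewer dimension than x, so the kernel shallow-copies indices and Resizes the copy to reinsert a size-1 axis at dim before calling gpu_scatter_add_kernel. A minimal NumPy sketch of the same rule (hypothetical shapes, not the Paddle kernel itself):

import numpy as np

# Sketch of the max_with_index backward rule: route values_grad back to
# the argmax positions along `dim`.
def max_with_index_grad(x, indices, values_grad, dim, keepdim):
    x_grad = np.zeros_like(x)
    if not keepdim:
        # reinsert the reduced axis, mirroring the shallow-copy + Resize
        indices = np.expand_dims(indices, axis=dim)
        values_grad = np.expand_dims(values_grad, axis=dim)
    # each slice contributes exactly one argmax entry, so writing into a
    # zero tensor matches the kernel's scatter-add
    np.put_along_axis(x_grad, indices, values_grad, axis=dim)
    return x_grad

x = np.array([[1.0, 5.0, 3.0], [4.0, 2.0, 6.0]])
idx = x.argmax(axis=1)  # shape [2]; rank reduced because keepdim=False
g = max_with_index_grad(x, idx, np.ones(2), dim=1, keepdim=False)
# g == [[0, 1, 0], [0, 0, 1]]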
+ +#include "paddle/phi/kernels/min_max_with_index_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(__NVCC__) || defined(__HIPCC__) + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif +#include + +#include "paddle/common/ddim.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" +namespace phi { + +namespace { // NOLINT +template +using KeyValuePair = cub::KeyValuePair; + +} // namespace + +#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ + case (1 << (log2_block_dim)): { \ + constexpr auto kBlockDim = (1 << (log2_block_dim)); \ + __VA_ARGS__; \ + } break + +#define FIXED_BLOCK_DIM_CASE(...) \ + FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); + +template +__global__ void MinMaxWithIndexKernel(const int64_t height, // n * h + const int64_t width, // c + const int64_t post_size, // h + const Reducer reducer, + const T init, + const T* in, + T* val_out, + IndType* key_out) { + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (IndexType idx = blockIdx.x; idx < height; idx += gridDim.x) { + KeyValuePair kv_pair = {-1, init}; + IndexType h = idx / post_size; + IndexType w = idx % post_size; + for (IndexType k = threadIdx.x; k < width; k += blockDim.x) { + kv_pair = + reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair); + } + kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); + if (threadIdx.x == 0) { + val_out[idx] = static_cast(kv_pair.value); + key_out[idx] = static_cast(kv_pair.key); + } + __syncthreads(); + } +} + +template +void ComputeMinMaxWithIndex(const phi::GPUContext& dev_ctx, + const DenseTensor& input, + DenseTensor* values, + DenseTensor* indices, + const int64_t pre, + const int64_t post, + const int64_t n) { + auto cu_stream = dev_ctx.stream(); + auto ComputeBlockSize = [](int64_t col) { + auto block_size = 8; + if (col > 512) + block_size = 1024; + else if (col > 256) + block_size = 512; + else if (col > 128) + block_size = 256; + else if (col > 64) + block_size = 128; + else if (col > 32) + block_size = 64; + else if (col > 16) + block_size = 32; + else if (col > 8) + block_size = 16; + return block_size; + }; + + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; + int64_t height = pre * post; + int64_t width = n; + int64_t grid_size = height < max_grid_dimx ? 
height : max_grid_dimx; + + const T* in_data = input.data(); + + T* val_data = dev_ctx.template Alloc(values); + IndType* ind_data = dev_ctx.template Alloc(indices); + + if (typeid(Reducer) == typeid(cub::ArgMax)) { + switch (ComputeBlockSize(width)) { + FIXED_BLOCK_DIM_CASE( + MinMaxWithIndexKernel + <<>>( + height, + width, + post, + Reducer(), + std::numeric_limits::lowest(), + in_data, + val_data, + ind_data)); + } + } else { + switch (ComputeBlockSize(width)) { + FIXED_BLOCK_DIM_CASE( + MinMaxWithIndexKernel + <<>>( + height, + width, + post, + Reducer(), + std::numeric_limits::max(), + in_data, + val_data, + ind_data)); + } + } +} + +template +struct VisitDataCudaMinMaxWithIndexFunctor { + const Context& dev_ctx; + const DenseTensor& x; + int64_t axis; + bool keepdims; + bool flatten; + DenseTensor* val_out; + DenseTensor* ind_out; + + explicit VisitDataCudaMinMaxWithIndexFunctor(const Context& dev_ctx, + const DenseTensor& x, + int64_t axis, + bool keepdims, + bool flatten, + DenseTensor* val_out, + DenseTensor* ind_out) + : dev_ctx(dev_ctx), + x(x), + axis(axis), + keepdims(keepdims), + flatten(flatten), + val_out(val_out), + ind_out(ind_out) {} + + template + void apply() const { + phi::DDim x_dims; + int new_axis = axis; + if (flatten) { + x_dims = common::make_ddim({x.numel()}); + // if flatten, treat the axis as 0 + new_axis = 0; + } else { + x_dims = x.dims(); + if (axis < 0) new_axis = axis + x.dims().size(); + } + if (x.numel() == 0) { + dev_ctx.template Alloc(val_out); + dev_ctx.template Alloc(ind_out); + return; + } + // For 0D Tensor + if (x.dims().size() == 0) { + dev_ctx.template Alloc(val_out); + dev_ctx.template Alloc(ind_out); + phi::funcs::set_constant(dev_ctx, ind_out, static_cast(0)); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, val_out); + return; + } + + int64_t numel = x.numel(); + int64_t groups = numel / x_dims[new_axis]; + int64_t pre = 1; + int64_t post = 1; + int64_t n = x_dims[new_axis]; + + for (int i = 0; i < new_axis; i++) { + pre *= x_dims[i]; + } + + for (int i = new_axis + 1; i < x_dims.size(); i++) { + post *= x_dims[i]; + } + + if (numel > std::numeric_limits::max()) { + ComputeMinMaxWithIndex( + dev_ctx, x, val_out, ind_out, pre, post, n); + } else { + ComputeMinMaxWithIndex( + dev_ctx, x, val_out, ind_out, pre, post, n); + } + } +}; + +template +void MinMaxWithIndexOpCUDAKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& axis, + bool keepdims, + bool flatten, + DenseTensor* val_out, + DenseTensor* ind_out) { + PADDLE_ENFORCE_GE( + x.numel(), + 0, + common::errors::InvalidArgument( + "(min/max)_with_index input numel must be >= 0, but got %d", x.numel())); + phi::VisitDataTypeTiny( + phi::DataType::INT64, + VisitDataCudaMinMaxWithIndexFunctor( + dev_ctx, x, axis.to(), keepdims, flatten, val_out, ind_out)); +} + +template +void MinWithIndexKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& dim, + bool keepdim, + bool flatten, + DenseTensor* val_out, + DenseTensor* ind_out) { + MinMaxWithIndexOpCUDAKernel( + dev_ctx, x, dim, keepdim, flatten, val_out, ind_out); +} + +template +void MaxWithIndexKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& dim, + bool keepdim, + bool flatten, + DenseTensor* val_out, + DenseTensor* ind_out) { + MinMaxWithIndexOpCUDAKernel( + dev_ctx, x, dim, keepdim, flatten, val_out, ind_out); +} + +#endif + +} // namespace phi + +PD_REGISTER_KERNEL(min_with_index, + GPU, + ALL_LAYOUT, + phi::MinWithIndexKernel, + phi::dtype::float16, + phi::dtype::bfloat16, + float,
+ double, + int32_t, + int64_t, + int16_t, + uint8_t) { + kernel->OutputAt(0).SetDataType(kernel->InputAt(0).dtype); + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); +} + +PD_REGISTER_KERNEL(max_with_index, + GPU, + ALL_LAYOUT, + phi::MaxWithIndexKernel, + phi::dtype::float16, + phi::dtype::bfloat16, + float, + double, + int32_t, + int64_t, + int16_t, + uint8_t) { + kernel->OutputAt(0).SetDataType(kernel->InputAt(0).dtype); + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu index 95132d09e2cc22..3f55297474015c 100644 --- a/paddle/phi/kernels/gpu/reduce_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_kernel.cu @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/reduce_kernel.h" +#include #include "paddle/phi/kernels/gpu/reduce_amin_amax_common.h" #include "paddle/phi/kernels/reduce_amin_grad_kernel.h" diff --git a/paddle/phi/kernels/index_elementwise_get_grad_kernel.h b/paddle/phi/kernels/index_elementwise_get_grad_kernel.h index 42550bbc08de70..f5d9c3a2847d05 100644 --- a/paddle/phi/kernels/index_elementwise_get_grad_kernel.h +++ b/paddle/phi/kernels/index_elementwise_get_grad_kernel.h @@ -20,7 +20,7 @@ namespace phi { template -void IndexElementwiseGetGradKernel(const Context& ctx, +void IndexElementwiseGetGradKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& index, const DenseTensor& out_grad, diff --git a/paddle/phi/kernels/legacy/compare_kernel.h b/paddle/phi/kernels/legacy/compare_kernel.h index 541ec10d244da4..95ea7081a1cfa3 100644 --- a/paddle/phi/kernels/legacy/compare_kernel.h +++ b/paddle/phi/kernels/legacy/compare_kernel.h @@ -19,42 +19,42 @@ limitations under the License. */ namespace phi { template -void LessThanRawKernel(const Context& ctx, +void LessThanRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out); template -void LessEqualRawKernel(const Context& ctx, +void LessEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out); template -void GreaterThanRawKernel(const Context& ctx, +void GreaterThanRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out); template -void GreaterEqualRawKernel(const Context& ctx, +void GreaterEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out); template -void EqualRawKernel(const Context& ctx, +void EqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out); template -void NotEqualRawKernel(const Context& ctx, +void NotEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, diff --git a/paddle/phi/kernels/legacy/cpu/compare_kernel.cc b/paddle/phi/kernels/legacy/cpu/compare_kernel.cc index 5b11c81f573a80..77800701c94b26 100644 --- a/paddle/phi/kernels/legacy/cpu/compare_kernel.cc +++ b/paddle/phi/kernels/legacy/cpu/compare_kernel.cc @@ -25,23 +25,23 @@ template -inline void CompareRawKernelImpl(const Context& ctx, +inline void CompareRawKernelImpl(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { - ctx.template Alloc(out); + dev_ctx.template Alloc(out); if (x.dims().size() >= y.dims().size()) { funcs::ElementwiseCompute( - ctx, x, y, Functor(), out, axis); + dev_ctx, x, y, Functor(), out, axis); } else { funcs::ElementwiseCompute( - ctx, x, 
y, InverseFunctor(), out, axis); + dev_ctx, x, y, InverseFunctor(), out, axis); } } template -void LessThanRawKernel(const Context& ctx, +void LessThanRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, @@ -49,11 +49,11 @@ void LessThanRawKernel(const Context& ctx, CompareRawKernelImpl, - funcs::GreaterThanFunctor>(ctx, x, y, axis, out); + funcs::GreaterThanFunctor>(dev_ctx, x, y, axis, out); } template -void LessEqualRawKernel(const Context& ctx, +void LessEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, @@ -61,11 +61,11 @@ void LessEqualRawKernel(const Context& ctx, CompareRawKernelImpl, - funcs::GreaterEqualFunctor>(ctx, x, y, axis, out); + funcs::GreaterEqualFunctor>(dev_ctx, x, y, axis, out); } template -void GreaterThanRawKernel(const Context& ctx, +void GreaterThanRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, @@ -73,10 +73,10 @@ void GreaterThanRawKernel(const Context& ctx, CompareRawKernelImpl, - funcs::LessThanFunctor>(ctx, x, y, axis, out); + funcs::LessThanFunctor>(dev_ctx, x, y, axis, out); } template -void GreaterEqualRawKernel(const Context& ctx, +void GreaterEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, @@ -84,10 +84,10 @@ void GreaterEqualRawKernel(const Context& ctx, CompareRawKernelImpl, - funcs::LessEqualFunctor>(ctx, x, y, axis, out); + funcs::LessEqualFunctor>(dev_ctx, x, y, axis, out); } template -void EqualRawKernel(const Context& ctx, +void EqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, @@ -95,10 +95,10 @@ void EqualRawKernel(const Context& ctx, CompareRawKernelImpl, - funcs::EqualFunctor>(ctx, x, y, axis, out); + funcs::EqualFunctor>(dev_ctx, x, y, axis, out); } template -void NotEqualRawKernel(const Context& ctx, +void NotEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, @@ -106,7 +106,7 @@ void NotEqualRawKernel(const Context& ctx, CompareRawKernelImpl, - funcs::NotEqualFunctor>(ctx, x, y, axis, out); + funcs::NotEqualFunctor>(dev_ctx, x, y, axis, out); } } // namespace phi diff --git a/paddle/phi/kernels/legacy/cpu/legacy_generate_proposals_kernel.cc b/paddle/phi/kernels/legacy/cpu/legacy_generate_proposals_kernel.cc index 5e6249249b1ee9..a77372f4592020 100644 --- a/paddle/phi/kernels/legacy/cpu/legacy_generate_proposals_kernel.cc +++ b/paddle/phi/kernels/legacy/cpu/legacy_generate_proposals_kernel.cc @@ -28,7 +28,7 @@ namespace phi { template std::pair ProposalForOneImage( - const phi::CPUContext &ctx, + const phi::CPUContext &dev_ctx, const phi::DenseTensor &im_info_slice, const phi::DenseTensor &anchors, const phi::DenseTensor &variances, @@ -44,7 +44,7 @@ std::pair ProposalForOneImage( // Sort index phi::DenseTensor index_t; index_t.Resize({scores_slice.numel()}); - int *index = ctx.Alloc(&index_t); + int *index = dev_ctx.Alloc(&index_t); for (int i = 0; i < scores_slice.numel(); ++i) { index[i] = i; } @@ -65,53 +65,54 @@ std::pair ProposalForOneImage( bbox_sel.Resize({index_t.numel(), 4}); anchor_sel.Resize({index_t.numel(), 4}); var_sel.Resize({index_t.numel(), 4}); - ctx.Alloc(&scores_sel); - ctx.Alloc(&bbox_sel); - ctx.Alloc(&anchor_sel); - ctx.Alloc(&var_sel); + dev_ctx.Alloc(&scores_sel); + dev_ctx.Alloc(&bbox_sel); + dev_ctx.Alloc(&anchor_sel); + dev_ctx.Alloc(&var_sel); - phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); - phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, 
&bbox_sel); - phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); - phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(dev_ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(dev_ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(dev_ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(dev_ctx, variances, index_t, &var_sel); phi::DenseTensor proposals; proposals.Resize({index_t.numel(), 4}); - ctx.Alloc(&proposals); - phi::funcs::BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); + dev_ctx.Alloc(&proposals); + phi::funcs::BoxCoder( + dev_ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); phi::funcs::ClipTiledBoxes( - ctx, im_info_slice, proposals, &proposals, false); + dev_ctx, im_info_slice, proposals, &proposals, false); phi::DenseTensor keep; phi::funcs::FilterBoxes( - ctx, &proposals, min_size, im_info_slice, true, &keep); + dev_ctx, &proposals, min_size, im_info_slice, true, &keep); // Handle the case when there is no keep index left if (keep.numel() == 0) { phi::funcs::SetConstant set_zero; bbox_sel.Resize({1, 4}); - ctx.Alloc(&bbox_sel); - set_zero(ctx, &bbox_sel, static_cast(0)); + dev_ctx.Alloc(&bbox_sel); + set_zero(dev_ctx, &bbox_sel, static_cast(0)); phi::DenseTensor scores_filter; scores_filter.Resize({1, 1}); - ctx.Alloc(&scores_filter); - set_zero(ctx, &scores_filter, static_cast(0)); + dev_ctx.Alloc(&scores_filter); + set_zero(dev_ctx, &scores_filter, static_cast(0)); return std::make_pair(bbox_sel, scores_filter); } phi::DenseTensor scores_filter; bbox_sel.Resize({keep.numel(), 4}); scores_filter.Resize({keep.numel(), 1}); - ctx.Alloc(&bbox_sel); - ctx.Alloc(&scores_filter); - phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); - phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); + dev_ctx.Alloc(&bbox_sel); + dev_ctx.Alloc(&scores_filter); + phi::funcs::CPUGather(dev_ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(dev_ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } phi::DenseTensor keep_nms = - phi::funcs::NMS(ctx, &bbox_sel, &scores_filter, nms_thresh, eta); + phi::funcs::NMS(dev_ctx, &bbox_sel, &scores_filter, nms_thresh, eta); if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { keep_nms.Resize({post_nms_top_n}); @@ -119,10 +120,10 @@ std::pair ProposalForOneImage( proposals.Resize({keep_nms.numel(), 4}); scores_sel.Resize({keep_nms.numel(), 1}); - ctx.Alloc(&proposals); - ctx.Alloc(&scores_sel); - phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); - phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + dev_ctx.Alloc(&proposals); + dev_ctx.Alloc(&scores_sel); + phi::funcs::CPUGather(dev_ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(dev_ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/phi/kernels/legacy/cpu/one_hot_kernel.cc b/paddle/phi/kernels/legacy/cpu/one_hot_kernel.cc index 85347e71c606ff..d475c5fec98d94 100644 --- a/paddle/phi/kernels/legacy/cpu/one_hot_kernel.cc +++ b/paddle/phi/kernels/legacy/cpu/one_hot_kernel.cc @@ -25,20 +25,20 @@ struct OneHotV2OpFunctor { const DenseTensor* in_; DenseTensor* out_; int depth_; - const DeviceContext& ctx_; + const DeviceContext& dev_ctx_; OneHotV2OpFunctor(const DenseTensor* in, DenseTensor* out, int depth, - const DeviceContext& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + const DeviceContext& dev_ctx) + : in_(in), out_(out), 
depth_(depth), dev_ctx_(dev_ctx) {} template void apply() const { auto* p_in_data = in_->data(); auto numel = in_->numel(); - auto* p_out_data = ctx_.template Alloc(out_); - funcs::set_constant(ctx_, out_, 0.0); + auto* p_out_data = dev_ctx_.template Alloc(out_); + funcs::set_constant(dev_ctx_, out_, 0.0); for (int i = 0; i < numel; ++i) { PADDLE_ENFORCE_GE( diff --git a/paddle/phi/kernels/legacy/gpu/layer_norm_cuda_kernel.h b/paddle/phi/kernels/legacy/gpu/layer_norm_cuda_kernel.h index 39bd2837b9e451..f6d81228b34b68 100644 --- a/paddle/phi/kernels/legacy/gpu/layer_norm_cuda_kernel.h +++ b/paddle/phi/kernels/legacy/gpu/layer_norm_cuda_kernel.h @@ -943,7 +943,7 @@ void HostApplyRMSNorm(V* output, } template -void cuda_rms_norm(const Context& ctx, +void cuda_rms_norm(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& scale, int rows, @@ -960,7 +960,7 @@ void cuda_rms_norm(const Context& ctx, cols, \ epsilon, \ const_cast(scale.data()), \ - ctx.stream()) + dev_ctx.stream()) // scale.dtype() same as y->dtype() if (scale.dtype() == phi::DataType::FLOAT32) { DISPATCH_FWD_CASE(float); @@ -971,7 +971,7 @@ void cuda_rms_norm(const Context& ctx, } template -void HostRMSNormGradient(const Context& ctx, +void HostRMSNormGradient(const Context& dev_ctx, const V* dout, const U* invvar, const DenseTensor& input, @@ -992,7 +992,7 @@ void HostRMSNormGradient(const Context& ctx, const int nshared2 = nshared2_a > nshared2_b ? nshared2_a : nshared2_b; auto place = input.place(); DenseTensor part_grad_gamma = - phi::Empty(ctx, {part_size, n2}); + phi::Empty(dev_ctx, {part_size, n2}); cuComputePartGradGammaBeta<<>>( dout, input.data(), @@ -1038,7 +1038,7 @@ void HostRMSNormGradient(const Context& ctx, } template -void cuda_rms_norm_gradient(const Context& ctx, +void cuda_rms_norm_gradient(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& scale, const DenseTensor& invvar, @@ -1050,7 +1050,7 @@ void cuda_rms_norm_gradient(const Context& ctx, DenseTensor* grad_scale) { #define DISPATCH_BWD_CASE(scalar_t_out) \ HostRMSNormGradient( \ - ctx, \ + dev_ctx, \ dy.data(), \ invvar.data(), \ x, \ @@ -1060,7 +1060,7 @@ void cuda_rms_norm_gradient(const Context& ctx, epsilon, \ grad_x->data(), \ grad_scale->data(), \ - ctx.stream()) + dev_ctx.stream()) if (scale.dtype() == phi::DataType::FLOAT32) { DISPATCH_BWD_CASE(float); } else if (scale.dtype() == phi::DataType::BFLOAT16) { diff --git a/paddle/phi/kernels/legacy/gpu/legacy_generate_proposals_kernel.cu b/paddle/phi/kernels/legacy/gpu/legacy_generate_proposals_kernel.cu index c7630a3717a41f..90e1a9f1c498aa 100644 --- a/paddle/phi/kernels/legacy/gpu/legacy_generate_proposals_kernel.cu +++ b/paddle/phi/kernels/legacy/gpu/legacy_generate_proposals_kernel.cu @@ -30,7 +30,7 @@ namespace phi { namespace { template static std::pair ProposalForOneImage( - const phi::GPUContext &ctx, + const phi::GPUContext &dev_ctx, const phi::DenseTensor &im_info, const phi::DenseTensor &anchors, const phi::DenseTensor &variances, @@ -43,7 +43,7 @@ static std::pair ProposalForOneImage( float eta) { // 1. pre nms phi::DenseTensor scores_sort, index_sort; - phi::funcs::SortDescending(ctx, scores, &scores_sort, &index_sort); + phi::funcs::SortDescending(dev_ctx, scores, &scores_sort, &index_sort); int num = scores.numel(); int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() : pre_nms_top_n; @@ -53,10 +53,10 @@ static std::pair ProposalForOneImage( // 2. 
box decode and clipping phi::DenseTensor proposals; proposals.Resize({pre_nms_num, 4}); - ctx.Alloc(&proposals); + dev_ctx.Alloc(&proposals); { - phi::funcs::ForRange for_range(ctx, pre_nms_num); + phi::funcs::ForRange for_range(dev_ctx, pre_nms_num); for_range(phi::funcs::BoxDecodeAndClipFunctor{anchors.data(), bbox_deltas.data(), variances.data(), @@ -69,10 +69,10 @@ static std::pair ProposalForOneImage( phi::DenseTensor keep_index, keep_num_t; keep_index.Resize({pre_nms_num}); keep_num_t.Resize({1}); - ctx.Alloc(&keep_index); - ctx.Alloc(&keep_num_t); + dev_ctx.Alloc(&keep_index); + dev_ctx.Alloc(&keep_num_t); min_size = std::max(min_size, 1.0f); - auto stream = ctx.stream(); + auto stream = dev_ctx.stream(); phi::funcs::FilterBBoxes <<<1, 512, 0, stream>>>(proposals.data(), im_info.data(), @@ -81,14 +81,14 @@ static std::pair ProposalForOneImage( keep_num_t.data(), keep_index.data()); int keep_num; - const auto gpu_place = ctx.GetPlace(); + const auto gpu_place = dev_ctx.GetPlace(); phi::memory_utils::Copy(phi::CPUPlace(), &keep_num, gpu_place, keep_num_t.data(), sizeof(int), - ctx.stream()); - ctx.Wait(); + dev_ctx.stream()); + dev_ctx.Wait(); keep_index.Resize({keep_num}); phi::DenseTensor scores_filter, proposals_filter; @@ -97,18 +97,18 @@ static std::pair ProposalForOneImage( phi::funcs::SetConstant set_zero; proposals_filter.Resize({1, 4}); scores_filter.Resize({1, 1}); - ctx.Alloc(&proposals_filter); - ctx.Alloc(&scores_filter); - set_zero(ctx, &proposals_filter, static_cast(0)); - set_zero(ctx, &scores_filter, static_cast(0)); + dev_ctx.Alloc(&proposals_filter); + dev_ctx.Alloc(&scores_filter); + set_zero(dev_ctx, &proposals_filter, static_cast(0)); + set_zero(dev_ctx, &scores_filter, static_cast(0)); return std::make_pair(proposals_filter, scores_filter); } proposals_filter.Resize({keep_num, 4}); scores_filter.Resize({keep_num, 1}); - ctx.Alloc(&proposals_filter); - ctx.Alloc(&scores_filter); - phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); - phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); + dev_ctx.Alloc(&proposals_filter); + dev_ctx.Alloc(&scores_filter); + phi::funcs::GPUGather(dev_ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(dev_ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -116,7 +116,8 @@ static std::pair ProposalForOneImage( // 4. 
nms phi::DenseTensor keep_nms; - phi::funcs::NMS(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms); + phi::funcs::NMS( + dev_ctx, proposals_filter, keep_index, nms_thresh, &keep_nms); if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { keep_nms.Resize({post_nms_top_n}); } @@ -124,10 +125,10 @@ static std::pair ProposalForOneImage( phi::DenseTensor scores_nms, proposals_nms; proposals_nms.Resize({keep_nms.numel(), 4}); scores_nms.Resize({keep_nms.numel(), 1}); - ctx.Alloc(&proposals_nms); - ctx.Alloc(&scores_nms); - phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + dev_ctx.Alloc(&proposals_nms); + dev_ctx.Alloc(&scores_nms); + phi::funcs::GPUGather(dev_ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(dev_ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/phi/kernels/legacy/gpu/one_hot_kernel.cu b/paddle/phi/kernels/legacy/gpu/one_hot_kernel.cu index c64f2e2d755662..8030231e7fa025 100644 --- a/paddle/phi/kernels/legacy/gpu/one_hot_kernel.cu +++ b/paddle/phi/kernels/legacy/gpu/one_hot_kernel.cu @@ -44,24 +44,24 @@ template struct OneHotV2OpCUDAFunctor { const DenseTensor* in_; DenseTensor* out_; - const DeviceContext& ctx_; + const DeviceContext& dev_ctx_; int depth_; OneHotV2OpCUDAFunctor(const DenseTensor* in, DenseTensor* out, int depth, - const DeviceContext& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + const DeviceContext& dev_ctx) + : in_(in), out_(out), depth_(depth), dev_ctx_(dev_ctx) {} template void apply() const { auto* p_in_data = in_->data(); auto numel = in_->numel(); - auto* p_out_data = ctx_.template Alloc(out_); - auto stream = ctx_.stream(); - funcs::set_constant(ctx_, out_, 0.0); + auto* p_out_data = dev_ctx_.template Alloc(out_); + auto stream = dev_ctx_.stream(); + funcs::set_constant(dev_ctx_, out_, 0.0); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx_, numel); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx_, numel); FillOutputKernel<< -inline void CompareRawKernelImpl(const Context& ctx, +inline void CompareRawKernelImpl(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { - ctx.template Alloc(out); + dev_ctx.template Alloc(out); out->set_type(phi::DataType::BOOL); if (out->numel() == 0) return; std::vector ins{&x, &y}; std::vector outs{out}; - funcs::BroadcastKernel(ctx, ins, &outs, Functor(), axis); + funcs::BroadcastKernel(dev_ctx, ins, &outs, Functor(), axis); } template -void LessThanRawKernel(const Context& ctx, +void LessThanRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { CompareRawKernelImpl>( - ctx, x, y, axis, out); + dev_ctx, x, y, axis, out); } template -void LessEqualRawKernel(const Context& ctx, +void LessEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { CompareRawKernelImpl>( - ctx, x, y, axis, out); + dev_ctx, x, y, axis, out); } template -void GreaterThanRawKernel(const Context& ctx, +void GreaterThanRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { CompareRawKernelImpl>( - ctx, x, y, axis, out); + dev_ctx, x, y, axis, out); } template -void GreaterEqualRawKernel(const Context& ctx, +void GreaterEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, 
DenseTensor* out) { CompareRawKernelImpl>( - ctx, x, y, axis, out); + dev_ctx, x, y, axis, out); } template -void EqualRawKernel(const Context& ctx, +void EqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { CompareRawKernelImpl>( - ctx, x, y, axis, out); + dev_ctx, x, y, axis, out); } template -void NotEqualRawKernel(const Context& ctx, +void NotEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { CompareRawKernelImpl>( - ctx, x, y, axis, out); + dev_ctx, x, y, axis, out); } } // namespace phi diff --git a/paddle/phi/kernels/legacy/xpu/compare_kernel.cc b/paddle/phi/kernels/legacy/xpu/compare_kernel.cc index 3d461517ac6f22..4253b86915d45e 100644 --- a/paddle/phi/kernels/legacy/xpu/compare_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/compare_kernel.cc @@ -61,13 +61,13 @@ void XPUCompareRawKernelImpl( int axis, \ DenseTensor* out) { \ using XPUType = typename XPUTypeTrait::Type; \ - auto f = [](xpu::Context* ctx, \ + auto f = [](xpu::Context* xpu_ctx, \ const XPUType* x, \ const XPUType* y, \ bool* z, \ const std::vector& xshape, \ const std::vector& yshape) { \ - return functor(ctx, x, y, z, xshape, yshape); \ + return functor(xpu_ctx, x, y, z, xshape, yshape); \ }; \ XPUCompareRawKernelImpl(dev_ctx, x, y, out, f); \ } diff --git a/paddle/phi/kernels/legacy/xpu/elementwise_add_kernel.cc b/paddle/phi/kernels/legacy/xpu/elementwise_add_kernel.cc index 2ca79cd26160b3..b3a891f280f662 100644 --- a/paddle/phi/kernels/legacy/xpu/elementwise_add_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/elementwise_add_kernel.cc @@ -36,13 +36,13 @@ void AddRawKernel(const Context& dev_ctx, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_add(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_add(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); diff --git a/paddle/phi/kernels/legacy/xpu/elementwise_divide_kernel.cc b/paddle/phi/kernels/legacy/xpu/elementwise_divide_kernel.cc index 3fed6a52fdff48..d87bf7362581b8 100644 --- a/paddle/phi/kernels/legacy/xpu/elementwise_divide_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/elementwise_divide_kernel.cc @@ -31,13 +31,13 @@ void DivideRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_div(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_div(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); diff --git a/paddle/phi/kernels/legacy/xpu/elementwise_kernel.cc b/paddle/phi/kernels/legacy/xpu/elementwise_kernel.cc index 0825014319dfe9..ce9aa48b883b26 100644 --- a/paddle/phi/kernels/legacy/xpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/elementwise_kernel.cc @@ -31,13 +31,13 @@ void MaximumRawKernel(const Context& dev_ctx, } using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_max(ctx, x, y, z, xshape, yshape); + return 
xpu::broadcast_max(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); @@ -55,13 +55,13 @@ void MinimumRawKernel(const Context& dev_ctx, } using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_min(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_min(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); @@ -74,13 +74,13 @@ void RemainderRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_mod(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_mod(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); @@ -93,13 +93,13 @@ void FloorDivideRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_floordiv(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_floordiv(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); @@ -112,13 +112,13 @@ void ElementwisePowRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_pow(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_pow(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); diff --git a/paddle/phi/kernels/legacy/xpu/elementwise_multiply_kernel.cc b/paddle/phi/kernels/legacy/xpu/elementwise_multiply_kernel.cc index d64499498ae8b7..e3cf1e7f377f20 100644 --- a/paddle/phi/kernels/legacy/xpu/elementwise_multiply_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/elementwise_multiply_kernel.cc @@ -31,13 +31,13 @@ void MultiplyRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_mul(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_mul(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); diff --git a/paddle/phi/kernels/legacy/xpu/elementwise_subtract_kernel.cc b/paddle/phi/kernels/legacy/xpu/elementwise_subtract_kernel.cc index bf5ea1381965ff..231b84a8dd91a4 100644 --- a/paddle/phi/kernels/legacy/xpu/elementwise_subtract_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/elementwise_subtract_kernel.cc @@ -26,13 +26,13 @@ void SubtractRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_sub(ctx, x, y, z, xshape, yshape); + return 
xpu::broadcast_sub(xpu_ctx, x, y, z, xshape, yshape); }; phi::XPUElementwise(dev_ctx, x, y, axis, out, f); diff --git a/paddle/phi/kernels/legacy/xpu/one_hot_kernel.cc b/paddle/phi/kernels/legacy/xpu/one_hot_kernel.cc index 02edbd128430b5..76903f89660e77 100644 --- a/paddle/phi/kernels/legacy/xpu/one_hot_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/one_hot_kernel.cc @@ -25,21 +25,21 @@ struct OneHotV2OpFunctor { const DenseTensor* in_; DenseTensor* out_; int depth_; - const Context& ctx_; + const Context& dev_ctx_; OneHotV2OpFunctor(const DenseTensor* in, DenseTensor* out, int depth, - const Context& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + const Context& dev_ctx) + : in_(in), out_(out), depth_(depth), dev_ctx_(dev_ctx) {} template void apply() const { auto* p_in_data = in_->data(); auto numel = in_->numel(); - auto* p_out_data = ctx_.template Alloc(out_); + auto* p_out_data = dev_ctx_.template Alloc(out_); int r = xpu::one_hot( - ctx_.x_context(), p_in_data, p_out_data, numel, depth_, 1.0, 0.0); + dev_ctx_.x_context(), p_in_data, p_out_data, numel, depth_, 1.0, 0.0); PADDLE_ENFORCE_XDNN_SUCCESS(r, "one_hot"); } }; diff --git a/paddle/phi/kernels/legacy/xpu/reduce_max_kernel.cc b/paddle/phi/kernels/legacy/xpu/reduce_max_kernel.cc index 4cb8d9d0439249..8c5881603e2e61 100644 --- a/paddle/phi/kernels/legacy/xpu/reduce_max_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/reduce_max_kernel.cc @@ -30,12 +30,12 @@ void MaxRawKernel(const Context& dev_ctx, DenseTensor* out) { reduce_all = recompute_reduce_all(x, dims, reduce_all); using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const T* x, T* y, const std::vector& xdims, const std::vector& reduce_dims) { - return xpu::reduce_max(ctx, + return xpu::reduce_max(xpu_ctx, reinterpret_cast(x), reinterpret_cast(y), xdims, diff --git a/paddle/phi/kernels/min_max_with_index_kernel.h b/paddle/phi/kernels/min_max_with_index_kernel.h new file mode 100644 index 00000000000000..eca50fc3a752e8 --- /dev/null +++ b/paddle/phi/kernels/min_max_with_index_kernel.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MinWithIndexKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& dim, + bool keepdim, + bool flatten, + DenseTensor* val_out, + DenseTensor* ind_out); + +template +void MaxWithIndexKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& dim, + bool keepdim, + bool flatten, + DenseTensor* val_out, + DenseTensor* ind_out); + +} // namespace phi diff --git a/paddle/phi/kernels/onednn/gaussian_kernel.cc b/paddle/phi/kernels/onednn/gaussian_kernel.cc index 98197961a9df6b..61cdb580008611 100644 --- a/paddle/phi/kernels/onednn/gaussian_kernel.cc +++ b/paddle/phi/kernels/onednn/gaussian_kernel.cc @@ -20,7 +20,7 @@ namespace phi { template -void GaussianKernel(const Context& ctx, +void GaussianKernel(const Context& dev_ctx, const IntArray& shape, float mean, float std, @@ -33,10 +33,10 @@ void GaussianKernel(const Context& ctx, engine = std::make_shared(); engine->seed(seed); } else { - engine = ctx.GetGenerator()->GetCPUEngine(); + engine = dev_ctx.GetGenerator()->GetCPUEngine(); } - T* data = ctx.template Alloc(out); + T* data = dev_ctx.template Alloc(out); for (int64_t i = 0; i < out->numel(); ++i) { data[i] = dist(*engine); } diff --git a/paddle/phi/kernels/stride/index_select_kernel.cc b/paddle/phi/kernels/stride/index_select_kernel.cc index 0f3a8aae1e4e71..6db84f5c89180f 100644 --- a/paddle/phi/kernels/stride/index_select_kernel.cc +++ b/paddle/phi/kernels/stride/index_select_kernel.cc @@ -25,7 +25,7 @@ COMMON_DECLARE_bool(use_stride_kernel); namespace phi { template -void IndexSelectStridedKernel(const Context& ctx, +void IndexSelectStridedKernel(const Context& dev_ctx, const DenseTensor& x, int64_t index, int dim, diff --git a/paddle/phi/kernels/stride/slice_kernel.cc b/paddle/phi/kernels/stride/slice_kernel.cc index fe65a002b67df6..bff461867c37d9 100644 --- a/paddle/phi/kernels/stride/slice_kernel.cc +++ b/paddle/phi/kernels/stride/slice_kernel.cc @@ -26,7 +26,7 @@ COMMON_DECLARE_bool(use_stride_kernel); namespace phi { template -void SliceStridedKernel(const Context& ctx, +void SliceStridedKernel(const Context& dev_ctx, const DenseTensor& input, const std::vector& axes, const IntArray& starts_arr, diff --git a/paddle/phi/kernels/stride/transpose_kernel.cc b/paddle/phi/kernels/stride/transpose_kernel.cc index aaa4773f60808f..a5d627c4613267 100644 --- a/paddle/phi/kernels/stride/transpose_kernel.cc +++ b/paddle/phi/kernels/stride/transpose_kernel.cc @@ -22,7 +22,7 @@ COMMON_DECLARE_bool(use_stride_kernel); namespace phi { template -void TransposeStridedKernel(const Context& ctx, +void TransposeStridedKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& axis, DenseTensor* out) { diff --git a/paddle/phi/ops/yaml/backward.yaml b/paddle/phi/ops/yaml/backward.yaml index 4760d51061c0f1..6a931d443605be 100644 --- a/paddle/phi/ops/yaml/backward.yaml +++ b/paddle/phi/ops/yaml/backward.yaml @@ -2277,6 +2277,16 @@ kernel : func : max_pool3d_with_index_grad +- backward_op : max_with_index_grad + forward : max_with_index (Tensor x, Scalar dim, bool keepdim, bool flatten) -> Tensor(values), Tensor(indices) + args : (Tensor x, Tensor values, Tensor indices, Tensor values_grad, Scalar dim, bool keepdim) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : max_with_index_grad + - backward_op : maxout_grad forward : maxout(Tensor x, int groups, int axis) -> Tensor(out) 
args : (Tensor x, Tensor out, Tensor out_grad, int groups, int axis) @@ -2340,6 +2350,16 @@ func : meshgrid_grad data_type : out_grad +- backward_op : min_with_index_grad + forward : min_with_index (Tensor x, Scalar dim, bool keepdim, bool flatten) -> Tensor(values), Tensor(indices) + args : (Tensor x, Tensor values, Tensor indices, Tensor values_grad, Scalar dim, bool keepdim) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : min_with_index_grad + - backward_op : mish_grad forward : mish (Tensor x, float lambda) -> Tensor(out) args : (Tensor x, Tensor out_grad, float lambda) diff --git a/paddle/phi/ops/yaml/fused_ops.yaml b/paddle/phi/ops/yaml/fused_ops.yaml index 291147c33367bf..991b1ab8c0ab6d 100644 --- a/paddle/phi/ops/yaml/fused_ops.yaml +++ b/paddle/phi/ops/yaml/fused_ops.yaml @@ -916,12 +916,13 @@ support_dygraph_mode : true - op: fused_transpose_split_quant - args: (Tensor x, IntArray tokens_per_expert, bool pow_2_scales=false) + args: (Tensor x, Tensor input_scales, IntArray tokens_per_expert, bool pow_2_scales=false) output: Tensor[](out){tokens_per_expert.size()}, Tensor[](scales){tokens_per_expert.size()} infer_meta: func: FusedTransposeSplitQuantInferMeta kernel: func: fused_transpose_split_quant + optional: input_scales support_dygraph_mode : true - op: fused_weighted_swiglu_act_quant diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 590055b43b9ba6..78f836e842cc3c 100644 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -3553,6 +3553,17 @@ backward : max_pool3d_with_index_grad interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : max_with_index + args : (Tensor x, Scalar(int64_t) dim, bool keepdim = false, bool flatten = false) + output : Tensor(values), Tensor(indices) + infer_meta : + func : MinMaxWithIndexInferMeta + kernel : + func : max_with_index + data_type : x + backward : max_with_index_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface, paddle::dialect::LayoutTransformationInterface + - op : maxout args : (Tensor x, int groups, int axis = 1) output : Tensor(out) @@ -3662,6 +3673,17 @@ backward : meshgrid_grad interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : min_with_index + args : (Tensor x, Scalar(int64_t) dim, bool keepdim = false, bool flatten = false) + output : Tensor(values), Tensor(indices) + infer_meta : + func : MinMaxWithIndexInferMeta + kernel : + func : min_with_index + data_type : x + backward : min_with_index_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface, paddle::dialect::LayoutTransformationInterface + - op : mish args : (Tensor x, float lambda) output : Tensor diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 4ebc15fdc9753c..53680e172adcd6 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -122,6 +122,7 @@ _pir_ops as _pir_ops, _typing as _typing, callbacks as callbacks, + compat as compat, fft as fft, hub as hub, linalg as linalg, diff --git a/python/paddle/compat.py b/python/paddle/compat.py new file mode 100644 index 00000000000000..1eef54a83fd2b8 --- /dev/null +++ b/python/paddle/compat.py @@ -0,0 +1,25 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .tensor.compat import ( + max, + min, + split, +) + +__all__ = [ + 'split', + 'min', + 'max', +] diff --git a/python/paddle/incubate/nn/functional/fp8.py index 7c524b865ee96b..be61e7bdb72ae3 100644 --- a/python/paddle/incubate/nn/functional/fp8.py +++ b/python/paddle/incubate/nn/functional/fp8.py @@ -173,7 +173,9 @@ def fused_swiglu_weighted_bwd( return _C_ops.fused_swiglu_weighted_bwd(o1, do2_s, unzipped_probs) -def fused_transpose_split_quant(x, tokens_per_expert, pow_2_scales=False): +def fused_transpose_split_quant( + x, input_scales, tokens_per_expert, pow_2_scales=False +): """ Applies fused transpose, split, and quantization operation for Mixture of Experts (MoE) models. @@ -215,7 +217,7 @@ def fused_transpose_split_quant(x, tokens_per_expert, pow_2_scales=False): >>> x = paddle.randn([384, 512], dtype='bfloat16') >>> x = paddle.clip(x, min=-50, max=50) >>> tokens_per_expert = [128, 128, 128] - >>> outs, scales = F.fused_transpose_split_quant(x, tokens_per_expert, pow_2_scales=True) + >>> outs, scales = F.fused_transpose_split_quant(x, None, tokens_per_expert, pow_2_scales=True) >>> print(outs[0].shape) [512, 128] >>> print(scales[0].shape) @@ -228,7 +230,7 @@ def fused_transpose_split_quant(x, tokens_per_expert, pow_2_scales=False): if in_dynamic_or_pir_mode(): return _C_ops.fused_transpose_split_quant( - x, tokens_per_expert, pow_2_scales + x, input_scales, tokens_per_expert, pow_2_scales ) diff --git a/python/paddle/nn/clip.py index b3fe014b27a350..0d650d8fed519e 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -717,6 +717,7 @@ def _dygraph_clip(self, params_grads): sum_square_list = [] sum_square_list_fp16 = [] sum_square_list_fp32 = [] + flag_auto_hybrid_pp = True # Determine whether to use the new dynamic graph semi-automatic parallel pp framework if len(params_grads) > 0 and len(params_grads[0]) > 0: src_mesh = params_grads[0][0].process_mesh else: @@ -742,6 +743,7 @@ def _dygraph_clip(self, params_grads): # if the gradient mesh is not equal to src mesh # do reshard to get the result of squared_l2 from other pp stage mesh if src_mesh is not None and g.process_mesh != src_mesh: + flag_auto_hybrid_pp = False pp_mesh = get_complete_pp_mesh(g.process_mesh) if set(g.process_mesh.process_ids) < set(pp_mesh.process_ids): sum_square = dist.reshard( @@ -791,6 +793,37 @@ def async_add_n(var_list): global_norm_var = async_add_n(global_norm_var) + # NOTE(zhengtianyu): Fix grad_clip in auto_hybrid_pp mode. + # Reason: In auto_hybrid_pp mode, each rank only keeps local parameters and gradient information, + # so global_norm_var is in a partial state, leading to an incorrect result. + # Following the dynamic manual-parallel reference: each rank computes its local global_norm_var, + # then performs a reduce(sum) across the pp group to get the correct global_norm_var.
+ # For complete alignment with old dygraph semi-auto parallel PP logic, + # refer to NOTE: align ClipGradByGlobalNorm in auto_parallel_align_mode + if flag_auto_hybrid_pp and src_mesh is not None: + g_mesh = dist.get_mesh() + if ( + g_mesh + and "pp" in g_mesh.dim_names + and g_mesh.get_dim_size("pp") > 1 + ): + # Get the pipeline parallelism subgroup for communication + pp_group = g_mesh.get_submesh_with_dim("pp").get_group("pp") + + # Perform all-reduce on the local tensor value across the PP group + global_norm_var_local = global_norm_var._local_value() + dist.all_reduce( + global_norm_var_local, + op=dist.ReduceOp.SUM, + group=pp_group, + ) + + global_norm_var = dist.shard_tensor( + global_norm_var_local, + global_norm_var.process_mesh, + global_norm_var.placements, + ) + if self.should_comm_on_shard_dim and hasattr(self, 'sharding_group'): paddle.distributed.all_reduce( global_norm_var._local_value(), group=self.sharding_group diff --git a/python/paddle/static/quantization/quant_int8_onednn_pass.py b/python/paddle/static/quantization/quant_int8_onednn_pass.py index 2387e8bd9b70f7..909a94427c9718 100644 --- a/python/paddle/static/quantization/quant_int8_onednn_pass.py +++ b/python/paddle/static/quantization/quant_int8_onednn_pass.py @@ -177,7 +177,7 @@ def _transform_to_conv_onednn(self, graph, op_node): conv_op_node.set_attr("Scale_weights", scale_w) conv_op_node.set_attr("Scale_in", scale_in) conv_op_node.set_attr("Scale_out", 1.0) - conv_op_node.set_attr("use_mkldnn", 1) + conv_op_node.set_attr("use_onednn", 1) conv_op_node.set_attr("force_fp32_output", 1) graph.link_to(input_var_node, conv_op_node) graph.link_to(weight_var_node, conv_op_node) @@ -223,7 +223,7 @@ def _transform_to_mul_onednn(self, graph, op_node): mul_op_node.set_attr("scale_y", scale_w) mul_op_node.set_attr("scale_x", scale_in) mul_op_node.set_attr("scale_out", 1.0) - mul_op_node.set_attr("use_mkldnn", 1) + mul_op_node.set_attr("use_onednn", 1) mul_op_node.set_attr("force_fp32_output", 1) graph.link_to(input_var_node, mul_op_node) graph.link_to(weight_var_node, mul_op_node) @@ -248,7 +248,7 @@ def _transform_to_quantize_onednn(self, graph, op_node): op_type='quantize', attrs={ 'data_format': 'ONEDNNLAYOUT', - 'use_mkldnn': 1, + 'use_onednn': 1, 'Scale': scale_in, 'is_negative_input': 1, }, diff --git a/python/paddle/tensor/compat.py b/python/paddle/tensor/compat.py new file mode 100644 index 00000000000000..e734023a11d96b --- /dev/null +++ b/python/paddle/tensor/compat.py @@ -0,0 +1,571 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
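The `ClipGradByGlobalNorm` fix in `python/paddle/nn/clip.py` above rests on one observation: under auto_hybrid_pp each rank holds only its own stage's parameters and gradients, so the locally accumulated squared norm is partial until it is summed across the pp group. A minimal sketch of that reduction, reusing the same mesh/group calls the patch itself uses (`local_sq_norm` is a hypothetical per-rank partial value, not a name from the patch):

```python
import paddle
import paddle.distributed as dist

def pp_global_norm(local_sq_norm):
    """Sum per-stage squared norms across the pp group, then take the root."""
    g_mesh = dist.get_mesh()
    if g_mesh and "pp" in g_mesh.dim_names and g_mesh.get_dim_size("pp") > 1:
        pp_group = g_mesh.get_submesh_with_dim("pp").get_group("pp")
        # each pp stage contributes only the norm of its local gradients
        dist.all_reduce(local_sq_norm, op=dist.ReduceOp.SUM, group=pp_group)
    return paddle.sqrt(local_sq_norm)
```

The patch applies this reduction to `global_norm_var._local_value()` and then re-wraps the result with `dist.shard_tensor`, so downstream code still sees a tensor with the original mesh and placements.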
+ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, NamedTuple + +import paddle +from paddle import _C_ops + +from ..base.framework import Variable +from ..framework import ( + in_dynamic_mode, +) + +if TYPE_CHECKING: + from collections.abc import Sequence + + from paddle import Tensor + +from paddle.utils.decorator_utils import ForbidKeywordsDecorator + +__all__ = [] + + +@ForbidKeywordsDecorator( + illegal_keys=["x", "num_or_sections", "axis", "name"], + func_name="paddle.compat.split", + correct_name="paddle.split", +) +def split( + tensor: Tensor, split_size_or_sections: int | Sequence[int], dim: int = 0 +) -> tuple[Tensor, ...]: + """ + (PyTorch Compatible API) Split the input tensor into multiple sub-Tensors. + + Args: + tensor (Tensor): An N-D Tensor. The data type is bool, bfloat16, float16, float32, float64, uint8, int8, int32 or int64. + split_size_or_sections (int|list|tuple): + If split_size_or_sections is an integer type, then tensor will be split into equally sized chunks (if possible). + The last chunk will be smaller if the tensor size along the given dimension dim is not divisible by split_size. + If split_size_or_sections is a list, then tensor will be split into len(split_size_or_sections) chunks with sizes + in dim according to split_size_or_sections. Negative inputs are not allowed. For example: for a dim with 9 channels, + [2, 3, -1] will not be interpreted as [2, 3, 4], but will be rejected and an exception will be thrown. + dim (int|Tensor, optional): The dim along which to split; it can be an integer or a ``0-D Tensor`` + with shape [] and data type ``int32`` or ``int64``. + If :math:`dim < 0`, the dim to split along is :math:`rank(x) + dim`. Default is 0. + Returns: + tuple(Tensor), The tuple of segmented Tensors. + + Note: + This is a PyTorch-compatible API that follows the function signature and behavior of ``torch.split``. + To use the original split of Paddle, please use `paddle.split` instead. + + Examples:
+ .. code-block:: python + + >>> import paddle + + >>> # x is a Tensor of shape [3, 8, 5] + >>> x = paddle.rand([3, 8, 5]) + + >>> out0, out1, out2 = paddle.compat.split(x, split_size_or_sections=3, dim=1) + >>> print(out0.shape) + [3, 3, 5] + >>> print(out1.shape) + [3, 3, 5] + >>> print(out2.shape) + [3, 2, 5] + + >>> out0, out1, out2 = paddle.compat.split(x, split_size_or_sections=[1, 2, 5], dim=1) + >>> print(out0.shape) + [3, 1, 5] + >>> print(out1.shape) + [3, 2, 5] + >>> print(out2.shape) + [3, 5, 5] + + >>> # dim is negative, the real dim is (rank(x) + dim)=1 + >>> out0, out1, out2 = paddle.compat.split(x, split_size_or_sections=3, dim=-2) + >>> print(out0.shape) + [3, 3, 5] + >>> print(out1.shape) + [3, 3, 5] + >>> print(out2.shape) + [3, 2, 5] + """ + + def GetSplitSize(split_size, shape_on_dim): + # number of complete chunks of `split_size`; the last chunk keeps the remainder + remaining_num = shape_on_dim % split_size + num_complete_section = shape_on_dim // split_size + if remaining_num == 0: + return num_complete_section + else: + sections = [split_size for _ in range(num_complete_section)] + sections.append(remaining_num) + return sections + + def GetShapeOnDimInRange(shape, dim: int) -> int: + shape_range = len(shape) + if isinstance(dim, int): + if dim < -shape_range or dim >= shape_range: + raise ValueError( + f"(InvalidArgument) The dim is expected to be in range of [-{shape_range}, {shape_range}), but got {dim}" + ) + return shape[dim] + + if isinstance(split_size_or_sections, (list, tuple)): + for i, section_size in enumerate(split_size_or_sections): + if isinstance(section_size, Variable): + shape_val = int(section_size.item(0)) + else: + shape_val = section_size + if shape_val < 0: + raise ValueError( + f"paddle.compat.split expects split_sizes to have only non-negative entries, but got size = {shape_val} on dim {i}" + ) + + if in_dynamic_mode(): + if isinstance(dim, Variable): + dim = dim.item(0) + assert dim + len(tensor.shape) >= 0, "(rank(x) + dim) must >= 0" + dim = (dim + len(tensor.shape)) if dim < 0 else dim + + if isinstance(split_size_or_sections, (list, tuple)): + if paddle.utils._contain_var(split_size_or_sections): + for index, item in enumerate(split_size_or_sections): + if isinstance(item, Variable): + split_size_or_sections[index] = split_size_or_sections[ + index + ].item() + elif not isinstance(split_size_or_sections, int): + raise TypeError( + "The type of 'split_size_or_sections' in split must be int, list or tuple in imperative mode, but " + f"received {type(split_size_or_sections)}." + ) + + if isinstance(split_size_or_sections, int): + # compute the per-chunk sections for the equal-split case + assert ( + split_size_or_sections > 0 + ), 'split_size_or_sections must be greater than 0.' + + split_size_or_sections = GetSplitSize( + split_size_or_sections, GetShapeOnDimInRange(tensor.shape, dim) + ) + + if isinstance(split_size_or_sections, list): + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) + else: + return tuple( + _C_ops.split_with_num(tensor, split_size_or_sections, dim) + ) + else: + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) + else: + if isinstance(dim, paddle.pir.Value): + raise TypeError( + "'dim' is not allowed to be a pir.Value in a static graph: " + "\npir.Value can not be used for indexing python lists/tuples." + ) + if isinstance(dim, int): + assert len(tensor.shape) + dim >= 0, "(rank(x) + dim) must >= 0" + dim = (len(tensor.shape) + dim) if dim < 0 else dim + + input_shape = tensor.shape + + if not isinstance(split_size_or_sections, (int, list, tuple)): + raise TypeError( + "The type of 'split_size_or_sections' in split must be int, list or tuple in static graph mode." + ) + if isinstance(split_size_or_sections, int): + assert ( + split_size_or_sections > 0 + ), 'split_size_or_sections must be greater than 0.' + + split_size_or_sections = GetSplitSize( + split_size_or_sections, GetShapeOnDimInRange(tensor.shape, dim) + ) + if isinstance(split_size_or_sections, list): + if paddle.utils._contain_var(split_size_or_sections): + split_size_or_sections = paddle.utils.get_int_tensor_list( + split_size_or_sections + ) + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) + else: + return tuple( + _C_ops.split_with_num(tensor, split_size_or_sections, dim) + ) + else: + if isinstance(dim, int) and input_shape[dim] > 0: + assert ( + len(split_size_or_sections) <= input_shape[dim] + ), 'len(split_size_or_sections) must not be more than input.shape[dim].' + if paddle.utils._contain_var(split_size_or_sections): + split_size_or_sections = paddle.utils.get_int_tensor_list( + split_size_or_sections + ) + return tuple(_C_ops.split(tensor, split_size_or_sections, dim))
+ + +class MinMaxRetType(NamedTuple): + values: Tensor + indices: Tensor + + +def _min_max_param_checker(func_name: str, *args: Any, **kwargs: Any): + def invalid_arguments_exception(error_prefix=""): + type_strs = [type(v).__name__ for v in args] + type_strs.extend([f"{k}={type(v).__name__}" for k, v in kwargs.items()]) + signature = ", ".join(type_strs) + + error_msg = ( + f"Invalid arguments for `paddle.compat.{func_name}`:\n{error_prefix}" + f"Got: (paddle.Tensor input, {signature}), but expected one of:\n" + f" - (input: paddle.Tensor) for reduce_{func_name} on all dims.\n" + f" - (input: paddle.Tensor, other: paddle.Tensor) -> see paddle.{func_name}imum\n" + f" - (input: paddle.Tensor, int dim (cannot be None), bool keepdim = False)\n" + ) + return TypeError(error_msg) + + def try_get_keys(key): + res = None + try: + res = kwargs[key] + except KeyError: + raise invalid_arguments_exception() from None + return res + + dim_or_other = None + keepdim = False + + num_args = len(args) + total_arg_num = num_args + len(kwargs) + if total_arg_num > 2: + raise invalid_arguments_exception() + elif total_arg_num == 2: + if num_args == 2: + dim_or_other, keepdim = args + if dim_or_other is None or isinstance( + dim_or_other, (Variable, paddle.pir.Value) + ): + raise invalid_arguments_exception() + elif num_args == 1: + dim_or_other = args[0] + if dim_or_other is None or isinstance( + dim_or_other, (Variable, paddle.pir.Value) + ): + raise invalid_arguments_exception() + keepdim = try_get_keys("keepdim") + else: + dim_or_other = try_get_keys("dim") + keepdim = try_get_keys("keepdim") + elif total_arg_num == 1: + if num_args: + dim_or_other = args[0] + if dim_or_other is None: + raise invalid_arguments_exception() + else: + if "dim" in kwargs: + dim_or_other = kwargs["dim"] + elif "other" in kwargs: + dim_or_other = kwargs["other"] + if not isinstance(dim_or_other, (Variable, paddle.pir.Value)): + raise invalid_arguments_exception() + if dim_or_other is None: + raise invalid_arguments_exception() + + if ( + dim_or_other is not None + and not isinstance(dim_or_other, (Variable, paddle.pir.Value)) + and type(dim_or_other) is not int + ): + raise invalid_arguments_exception( + f"The second input must be int or Tensor or implicit None in compat.{func_name}, but received {type(dim_or_other)}.\n" + ) + + return dim_or_other, keepdim
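The chunking rule described in the docstring above (equal chunks with a smaller trailing chunk, negative sections rejected outright) is easy to state in pure Python. A minimal self-contained sketch; `sections_for` is a hypothetical helper for illustration, not part of this patch:

```python
def sections_for(length, split_size_or_sections):
    """Section sizes paddle.compat.split would produce along one dim."""
    if isinstance(split_size_or_sections, int):
        full, rem = divmod(length, split_size_or_sections)
        sections = [split_size_or_sections] * full
        if rem:
            sections.append(rem)  # trailing chunk keeps the remainder
        return sections
    if any(s < 0 for s in split_size_or_sections):
        raise ValueError("negative sections are rejected, never inferred")
    return list(split_size_or_sections)

assert sections_for(8, 3) == [3, 3, 2]          # mirrors the dim=1 example
assert sections_for(8, [1, 2, 5]) == [1, 2, 5]  # explicit sections pass through
```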
+ + +def _min_max_tensor_allow_grad(input: Tensor): + """Prevent integral input tensors from having `stop_gradient=False`""" + in_dtype = input.dtype + if ( + in_dtype == paddle.int32 + or in_dtype == paddle.int64 + or in_dtype == paddle.uint8 + or in_dtype == paddle.int16 + ): + if not input.stop_gradient: + raise TypeError( + f"Tensors with integral type: '{in_dtype}' should stop gradient." + ) + + +def _min_max_allow_cpu_composite(input: Tensor): + """paddle.min/argmin (max/argmax) and paddle.take_along_axis reject the following types""" + in_dtype = input.dtype + if ( + in_dtype == paddle.float16 + or in_dtype == paddle.bfloat16 + or in_dtype == paddle.int16 + ): + raise TypeError( + f"Non-CUDA GPU placed Tensor does not have '{in_dtype}' op registered.\n" + "Paddle supports the following DataTypes: int32, int64, float64, float32, uint8" + ) + + +@ForbidKeywordsDecorator( + illegal_keys=['x', 'axis'], + func_name="paddle.compat.min", + correct_name='paddle.min', +) +def min(input: Tensor, *args: Any, **kwargs: Any) -> Tensor | MinMaxRetType: + """ + + Computes the minimum of tensor elements. There are mainly 3 cases (functionalities): + 1. paddle.compat.min(input: Tensor): reduce min over all dims, return a single value Tensor + 2. paddle.compat.min(input: Tensor, dim: int (cannot be None), keepdim=False): reduce min over the given dim, + returns a named tuple MinMaxRetType(values: Tensor, indices: Tensor) + 3. paddle.compat.min(input: Tensor, other: Tensor): see `paddle.minimum` + + Special warning: the gradient behavior is NOT well-documented by PyTorch; the actual behavior is: + 1. Case 1: the same as `amin` + 2. Case 2: NOT evenly distributing the gradient for equal minimum elements! PyTorch actually only propagates the gradient to the elements selected by `indices`, + for example: Tensor([1, 1, 1]) -> min(..., dim=0) -> values=Tensor(1), indices=Tensor(0); the gradient for the input tensor won't be + Tensor([1/3, 1/3, 1/3]) as stated in their documentation, but will be Tensor([1, 0, 0]). This API implements a similar backward kernel. + 3. Case 3: the same as `minimum` + + Args: + input (Tensor): A tensor; the data type is bfloat16, float16, float32, float64, int32, int64 on GPU. + uint8, int32, int64, float32, float64 are allowed on CPU. + dim (int, optional): The dim along which the minimum is computed. + If this is not specified (case 1; note that explicitly passing `None` will throw a TypeError), + the minimum is computed over all elements of `input` and a Tensor with a single element is returned; + otherwise it must be in the range :math:`[-input.ndim, input.ndim)`. + If :math:`dim < 0`, the axis to reduce is :math:`input.ndim + dim`. + Warning: if `dim` is specified, executing a static graph will throw exceptions + when not on a GPU device, since min_with_index is not implemented for non-GPU devices + keepdim (bool, optional): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the `input` unless :attr:`keepdim` is true, default + value is False. Note that if `dim` appears in neither (*args) nor (**kwargs), this parameter cannot be passed alone + other (Tensor, optional): the other tensor to perform `paddle.minimum` with. This Tensor should + have the same or broadcastable shape as the `input`. Note that (`dim` & `keepdim`) and `other` are mutually exclusive,
+ meaning that trying to combine both will result in a TypeError + + Returns: + - For case 1: a single value Tensor (0-dim) + - For case 2: a named tuple MinMaxRetType(values: Tensor, indices: Tensor), where `values` has the same data type as the `input`, + while indices is always an int64 Tensor, with exactly the same shape as `values`. + MinMaxRetType can be used (indexed, packed, unpacked) in the same way as a regular tuple + - For case 3: see `paddle.minimum` + + + Examples: + .. code-block:: python + + >>> import paddle + + >>> # data_x is a Tensor with shape [2, 4] + >>> # the axis is an int element + >>> x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + ... [0.1, 0.2, 0.6, 0.7]], + ... dtype='float64', stop_gradient=False) + >>> # Case 1: reduce over all dims + >>> result1 = paddle.compat.min(x) + >>> result1 + Tensor(shape=[], dtype=float64, place=Place(gpu:0), stop_gradient=False, + 0.10000000) + + >>> # Case 2: reduce over specified dim + >>> x.clear_grad() + >>> result2 = paddle.compat.min(x, dim=1) + >>> result2 + MinMaxRetType(values=Tensor(shape=[2], dtype=float64, place=Place(gpu:0), stop_gradient=False, + [0.20000000, 0.10000000]), indices=Tensor(shape=[2], dtype=int64, place=Place(gpu:0), stop_gradient=True, + [0, 0])) + >>> result2[0].backward() + >>> x.grad + Tensor(shape=[2, 4], dtype=float64, place=Place(gpu:0), stop_gradient=False, + [[1., 0., 0., 0.], + [1., 0., 0., 0.]]) + + >>> # Case 3: equivalent to `paddle.minimum` + >>> x.clear_grad() + >>> y = paddle.to_tensor([[0.5, 0.4, 0.1, 0.2], + ... [0.3, 0.1, 0.6, 0.7]], + ... dtype='float64', stop_gradient=False) + >>> result3 = paddle.compat.min(x, y) + >>> result3 + Tensor(shape=[2, 4], dtype=float64, place=Place(gpu:0), stop_gradient=False, + [[0.20000000, 0.30000000, 0.10000000, 0.20000000], + [0.10000000, 0.10000000, 0.60000000, 0.70000000]]) + """ + if not isinstance(input, paddle.pir.Value) and not isinstance( + input, paddle.Tensor + ): + raise TypeError( + f"input should be a tensor, but got an instance with type '{type(input).__name__}'" + ) + _min_max_tensor_allow_grad(input) + + dim_or_other, keepdim = _min_max_param_checker("min", *args, **kwargs) + + if dim_or_other is None: + if input.numel() == 0: + raise ValueError( + "Reduce min cannot apply on empty tensor (numel == 0)" + ) + return paddle.amin(input) + elif isinstance(dim_or_other, int): + if in_dynamic_mode() and not input.place.is_gpu_place(): + _min_max_allow_cpu_composite(input) + # CPUPlace and other placements are implemented by composition + indices = paddle.argmin(input, axis=dim_or_other, keepdim=True) + values = paddle.take_along_axis(input, indices, axis=dim_or_other) + if keepdim: + return MinMaxRetType(values=values, indices=indices) + return MinMaxRetType( + values=values.squeeze_(axis=dim_or_other), + indices=indices.squeeze_(axis=dim_or_other), + ) + else: + vals, inds = _C_ops.min_with_index( + input, dim_or_other, keepdim, False + ) + inds.stop_gradient = True + return MinMaxRetType(values=vals, indices=inds) + else: + return _C_ops.minimum(input, dim_or_other) + + +@ForbidKeywordsDecorator( + illegal_keys=['x', 'axis'], + func_name="paddle.compat.max", + correct_name='paddle.max', +) +def max(input: Tensor, *args: Any, **kwargs: Any) -> Tensor | MinMaxRetType: + """ + + Computes the maximum of tensor elements. There are mainly 3 cases (functionalities): + 1. paddle.compat.max(input: Tensor): reduce max over all dims, return a single value Tensor
+ 2. paddle.compat.max(input: Tensor, dim: int (cannot be None), keepdim=False): reduce max over the given dim, + returns a named tuple MinMaxRetType(values: Tensor, indices: Tensor) + 3. paddle.compat.max(input: Tensor, other: Tensor): see `paddle.maximum` + + Special warning: the gradient behavior is NOT well-documented by PyTorch; the actual behavior is: + 1. Case 1: the same as `amax` + 2. Case 2: NOT evenly distributing the gradient for equal maximum elements! PyTorch actually only propagates the gradient to the elements selected by `indices`, + for example: Tensor([1, 1, 1]) -> max(..., dim=0) -> values=Tensor(1), indices=Tensor(0); the gradient for the input tensor won't be + Tensor([1/3, 1/3, 1/3]) as stated in their documentation, but will be Tensor([1, 0, 0]). This API implements a similar backward kernel. + 3. Case 3: the same as `maximum` + + Args: + input (Tensor): A tensor; the data type is bfloat16, float16, float32, float64, int32, int64 on GPU. + uint8, int32, int64, float32, float64 are allowed on CPU. + dim (int, optional): The dim along which the maximum is computed. + If this is not specified (case 1; note that explicitly passing `None` will throw a TypeError), + the maximum is computed over all elements of `input` and a Tensor with a single element is returned; + otherwise it must be in the range :math:`[-input.ndim, input.ndim)`. + If :math:`dim < 0`, the axis to reduce is :math:`input.ndim + dim`. + Warning: if `dim` is specified, executing a static graph will throw exceptions + when not on a GPU device, since max_with_index is not implemented for non-GPU devices + keepdim (bool, optional): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the `input` unless :attr:`keepdim` is true, default + value is False. Note that if `dim` appears in neither (*args) nor (**kwargs), this parameter cannot be passed alone + other (Tensor, optional): the other tensor to perform `paddle.maximum` with. This Tensor should + have the same or broadcastable shape as the `input`. Note that (`dim` & `keepdim`) and `other` are mutually exclusive, + meaning that trying to combine both will result in a TypeError + + Returns: + - For case 1: a single value Tensor (0-dim) + - For case 2: a named tuple MinMaxRetType(values: Tensor, indices: Tensor), where `values` has the same data type as the `input`, + while indices is always an int64 Tensor, with exactly the same shape as `values`. + MinMaxRetType can be used (indexed, packed, unpacked) in the same way as a regular tuple + - For case 3: see `paddle.maximum` + + + Examples: + .. code-block:: python + + >>> import paddle + + >>> # data_x is a Tensor with shape [2, 4] + >>> # the axis is an int element + >>> x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + ... [0.1, 0.2, 0.6, 0.7]], + ...
dtype='float64', stop_gradient=False) + >>> # Case 1: reduce over all dims + >>> result1 = paddle.compat.max(x) + >>> result1 + Tensor(shape=[], dtype=float64, place=Place(gpu:0), stop_gradient=False, + 0.90000000) + + >>> # Case 2: reduce over specified dim + >>> x.clear_grad() + >>> result2 = paddle.compat.max(x, dim=1) + >>> result2 + MinMaxRetType(values=Tensor(shape=[2], dtype=float64, place=Place(gpu:0), stop_gradient=False, + [0.90000000, 0.70000000]), indices=Tensor(shape=[2], dtype=int64, place=Place(gpu:0), stop_gradient=True, + [3, 3])) + >>> result2[0].backward() + >>> x.grad + Tensor(shape=[2, 4], dtype=float64, place=Place(gpu:0), stop_gradient=False, + [[0., 0., 0., 1.], + [0., 0., 0., 1.]]) + + >>> # Case 3: equivalent to `paddle.maximum` + >>> x.clear_grad() + >>> y = paddle.to_tensor([[0.5, 0.4, 0.1, 0.2], + ... [0.3, 0.1, 0.6, 0.7]], + ... dtype='float64', stop_gradient=False) + >>> result3 = paddle.compat.max(x, y) + >>> result3 + Tensor(shape=[2, 4], dtype=float64, place=Place(gpu:0), stop_gradient=False, + [[0.50000000, 0.40000000, 0.50000000, 0.90000000], + [0.30000000, 0.20000000, 0.60000000, 0.70000000]]) + """ + if not isinstance(input, paddle.pir.Value) and not isinstance( + input, paddle.Tensor + ): + raise TypeError( + f"input should be a tensor, but got an instance with type '{type(input).__name__}'" + ) + _min_max_tensor_allow_grad(input) + + dim_or_other, keepdim = _min_max_param_checker("max", *args, **kwargs) + + if dim_or_other is None: + if input.numel() == 0: + raise ValueError( + "Reduce max cannot apply on empty tensor (numel == 0)" + ) + return paddle.amax(input) + elif isinstance(dim_or_other, int): + if in_dynamic_mode() and not input.place.is_gpu_place(): + _min_max_allow_cpu_composite(input) + indices = paddle.argmax(input, axis=dim_or_other, keepdim=True) + values = paddle.take_along_axis(input, indices, axis=dim_or_other) + if keepdim: + return MinMaxRetType(values=values, indices=indices) + return MinMaxRetType( + values=values.squeeze_(axis=dim_or_other), + indices=indices.squeeze_(axis=dim_or_other), + ) + else: + vals, inds = _C_ops.max_with_index( + input, dim_or_other, keepdim, False + ) + inds.stop_gradient = True + return MinMaxRetType(values=vals, indices=inds) + else: + return _C_ops.maximum(input, dim_or_other) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index cb9b300b6d624f..55432ea9adcbaa 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -24,7 +24,7 @@ import paddle from paddle import _C_ops -from paddle.utils.decorator_utils import ParamAliasDecorator +from paddle.utils.decorator_utils import ParamAliasDecorator, SizeArgsDecorator from paddle.utils.inplace_utils import inplace_apis_in_dygraph_only from ..base.data_feeder import ( @@ -1241,6 +1241,7 @@ def fill_constant( return out +@SizeArgsDecorator() def ones( shape: ShapeLike, dtype: DTypeLike | None = None, name: str | None = None ) -> paddle.Tensor: @@ -3032,7 +3033,7 @@ def _memcpy(input, place=None, output=None) -> paddle.Tensor: def complex( - real: paddle.Tensor, imag: paddle.Tensor, name: str | None = None + real: paddle.Tensor, imag: paddle.Tensor, out=None, name: str | None = None ) -> paddle.Tensor: """Return a complex tensor given the real and image component. @@ -3040,6 +3041,7 @@ def complex( real (Tensor): The real component. The data type should be 'float32' or 'float64'. imag (Tensor): The image component. The data type should be the same as ``real``. 
name(str|None, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. + out (Tensor|None, optional): The output tensor. Default: None. Returns: Tensor, The output tensor. The data type is 'complex64' or 'complex128', with the same precision as ``real`` and ``imag``. @@ -3062,7 +3064,7 @@ def complex( [(1+0j), (1+1j), (1+2j)]]) """ if in_dynamic_or_pir_mode(): - return _C_ops.complex(real, imag) + return _C_ops.complex(real, imag, out=out) else: check_variable_and_dtype( real, 'real', ['float32', 'float64'], 'complex' diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 857554b5dd1f2a..2014603dff6ca6 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -58,6 +58,8 @@ TensorOrTensors, ) +from paddle.utils.decorator_utils import ForbidKeywordsDecorator + __all__ = [] @@ -2723,6 +2725,11 @@ def row_stack(x: Sequence[Tensor], name: str | None = None) -> Tensor: return paddle.vstack(x, name=name) +@ForbidKeywordsDecorator( + illegal_keys=["tensor", "split_size_or_sections", "dim"], + func_name="paddle.split", + correct_name="paddle.compat.split", +) def split( x: Tensor, num_or_sections: int | Sequence[int], diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 4c065b4ab43c2b..868c9eb1c10173 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -100,6 +100,8 @@ from paddle import Tensor from paddle._typing import DTypeLike +from paddle.utils.decorator_utils import ForbidKeywordsDecorator + __all__ = [] _supported_int_dtype_ = [ @@ -3131,6 +3133,11 @@ def _check_input(x): return out +@ForbidKeywordsDecorator( + illegal_keys=["input", "dim", "other"], + func_name="paddle.max", + correct_name="paddle.compat.max", +) def max( x: Tensor, axis: int | Sequence[int] | None = None, @@ -3290,6 +3297,11 @@ def max( return out +@ForbidKeywordsDecorator( + illegal_keys=["input", "dim", "other"], + func_name="paddle.min", + correct_name="paddle.compat.min", +) def min( x: Tensor, axis: int | Sequence[int] | None = None, diff --git a/python/paddle/utils/decorator_utils.py b/python/paddle/utils/decorator_utils.py index 79ec73937ec8c1..97d1f4da603517 100644 --- a/python/paddle/utils/decorator_utils.py +++ b/python/paddle/utils/decorator_utils.py @@ -89,3 +89,56 @@ def process( f"Cannot specify both '{original}' and its alias '{alias}'" ) return args, processed_kwargs + + +# *size => shape decorator +class SizeArgsDecorator(DecoratorBase): + """ + Usage Example: + + paddle.ones(1, dtype=paddle.float32) + paddle.ones(1, 2, 3, dtype=paddle.float32) + paddle.ones([1, 2, 3], dtype=paddle.float32) + paddle.ones(size=[1, 2, 3], dtype=paddle.float32) + + paddle.ones([1, 2, 3], paddle.float32) + paddle.ones(shape=[1, 2, 3], dtype=paddle.float32) + """ + + def process( + self, args: tuple[Any, ...], kwargs: dict[str, Any] + ) -> tuple[tuple[Any, ...], dict[str, Any]]: + if 'size' in kwargs: + kwargs['shape'] = kwargs.pop('size') + elif len(args) >= 1 and isinstance(args[0], int): + kwargs['shape'] = list(args) + args = () + + return args, kwargs + + +class ForbidKeywordsDecorator(DecoratorBase): + """A decorator that hints users to use the correct `compat` functions, when erroneous keyword arguments are detected""" + + def __init__( + self, illegal_keys: list[str], func_name: str, correct_name: str + ) -> None: + super().__init__() + self.illegal_keys = illegal_keys + self.func_name = func_name + self.correct_name = correct_name + 
+ def process( + self, args: tuple[Any, ...], kwargs: dict[str, Any] + ) -> tuple[tuple[Any, ...], dict[str, Any]]: + found_keys = [key for key in self.illegal_keys if key in kwargs] + + if found_keys: + keys_str = ", ".join(f"'{key}'" for key in found_keys) + plural = "s" if len(found_keys) > 1 else "" + + raise TypeError( + f"{self.func_name}() received unexpected keyword argument{plural} {keys_str}. " + f"\nDid you mean to use {self.correct_name}() instead?" + ) + return args, kwargs diff --git a/test/auto_parallel/PP_Schedules_demo.py b/test/auto_parallel/PP_Schedules_demo.py index 6ac055410fbf0a..be8963356d0661 100644 --- a/test/auto_parallel/PP_Schedules_demo.py +++ b/test/auto_parallel/PP_Schedules_demo.py @@ -414,6 +414,67 @@ def test_dp_pp(self): opt.clear_grad() return losses_by_step, all_losses_in_one_step_md5sum + def test_pp_model_with_ClipGradByGlobalNorm(self): + """Test pipeline parallel model with ClipGradByGlobalNorm using PPMyModel as the baseline""" + fix_seeds() + pp_model = PPMyModel() + opt = paddle.optimizer.AdamW( + learning_rate=0.001, + parameters=pp_model.parameters(), + grad_clip=paddle.nn.ClipGradByGlobalNorm(1.0), + ) + loss_fn = nn.MSELoss() + dataset = RandomDataset(image_size=8, output_size=8, num_samples=8) + loader = DataLoader(dataset, batch_size=1) + pp_losses_step = [] + num_iterations = 20 + + for iter_idx in range(num_iterations): + pp_losses_micro_batch = [] + for i, (data, label) in enumerate(loader): + output = pp_model(data) + loss = loss_fn(output, label) + pp_losses_micro_batch.append(loss.item()) + loss.backward() + pp_losses_step.append( + np.array(pp_losses_micro_batch, dtype=np.float32).mean() + ) + opt.step() + opt.clear_grad() + return pp_losses_step + + def test_ScheduleFThenB_with_ClipGradByGlobalNorm(self): + fix_seeds() + self.model = PPMyModel_SingleStage() + self.micro_batches = 8 + self.stage = PipelineStage(self.model, self.rank, 4, group=self.group) + self.stage.has_backward = True + loss_fn_ = nn.MSELoss() + schedule = ScheduleFThenB( + self.stage, self.micro_batches, loss_fn=loss_fn_ + ) + opt = paddle.optimizer.AdamW( + learning_rate=0.001, + parameters=self.model.parameters(), + grad_clip=paddle.nn.ClipGradByGlobalNorm(1.0), + ) + dataset = RandomDataset(image_size=8, output_size=8, num_samples=8) + loader = DataLoader(dataset, batch_size=8) + losses_by_step = [] + num_iterations = 20 + + for iter_idx in range(num_iterations): + losses_by_micro_batch = [] + for i, (data, label) in enumerate(loader): + schedule.step(data, target=label, losses=losses_by_micro_batch) + if self.rank == 3: + losses_by_step.append( + np.array(losses_by_micro_batch, dtype=np.float32).mean() + ) + opt.step() + opt.clear_grad() + return losses_by_step + def test_dp_pp_align_mode(self): fix_seeds() paddle.set_flags({'FLAGS_enable_auto_parallel_align_mode': True}) @@ -490,6 +551,12 @@ def run_test(self): scheduleFThenB_losses = self.test_ScheduleFThenB() schedule1f1b_losses = self.test_Schedule1F1B() schedulevpp_losses = self.test_ScheduleVPP() + pp_model_with_ClipGradByGlobalNorm_losses = ( + self.test_pp_model_with_ClipGradByGlobalNorm() + ) + scheduleFThenB_with_ClipGradByGlobalNorm_losses = ( + self.test_ScheduleFThenB_with_ClipGradByGlobalNorm() + ) dp_pp_losses, dp_pp_losses_md5sum = self.test_dp_pp() dp_pp_align_mode_losses, dp_pp_align_mode_losses_md5sum = ( self.test_dp_pp_align_mode() @@ -520,6 +587,12 @@ def run_test(self): rtol=1e-5, ) + np.testing.assert_allclose( + pp_model_with_ClipGradByGlobalNorm_losses, + 
scheduleFThenB_with_ClipGradByGlobalNorm_losses, + rtol=1e-5, + ) + np.testing.assert_allclose( dp_pp_align_mode_losses, dp_pp_losses, diff --git a/test/cpp/cinn/common/integer_set_test.cc b/test/cpp/cinn/common/integer_set_test.cc index 6d57f2dd0ed257..3f7afd4bcae50d 100644 --- a/test/cpp/cinn/common/integer_set_test.cc +++ b/test/cpp/cinn/common/integer_set_test.cc @@ -24,11 +24,13 @@ namespace common { class TestSymbolicExprAnalyzer : public ::testing::Test { public: void SetUp() override { - i = ir::Var(ir::Expr(0), ir::Expr(7), "i"); - j = ir::Var(ir::Expr(0), ir::Expr(15), "j"); + // Var is [lower_bound, upper_bound) + i = ir::Var(ir::Expr(0), ir::Expr(7), "i"); // i ∈ [0, 7) + j = ir::Var(ir::Expr(0), ir::Expr(15), "j"); // j ∈ [0, 15) + // CasInterval is [lower_bound, upper_bound] var_intervals = { - {"i", CasInterval(i->lower_bound, i->upper_bound)}, - {"j", CasInterval(j->lower_bound, j->upper_bound)}, + {"i", CasInterval(i->lower_bound, i->upper_bound - 1)}, // i ∈ [0, 6] + {"j", CasInterval(j->lower_bound, j->upper_bound - 1)}, // j ∈ [0, 14] }; } @@ -41,35 +43,35 @@ class TestSymbolicExprAnalyzer : public ::testing::Test { TEST_F(TestSymbolicExprAnalyzer, bound) { ir::Expr e1 = i + j; EXPECT_EQ(analyzer.LowerBound(e1), ir::Expr(0)); - EXPECT_EQ(analyzer.UpperBound(e1), ir::Expr(22)); + EXPECT_EQ(analyzer.UpperBound(e1), ir::Expr(20)); // 6 + 14 = 20 ir::Expr e2 = 16 * i + j; EXPECT_EQ(analyzer.LowerBound(e2), ir::Expr(0)); - EXPECT_EQ(analyzer.UpperBound(e2), ir::Expr(127)); + EXPECT_EQ(analyzer.UpperBound(e2), ir::Expr(110)); // 16 * 6 + 14 = 110 ir::Expr e3 = 16 * i + j + 1; EXPECT_EQ(analyzer.LowerBound(e3), ir::Expr(1)); - EXPECT_EQ(analyzer.UpperBound(e3), ir::Expr(128)); + EXPECT_EQ(analyzer.UpperBound(e3), ir::Expr(111)); // 16 * 6 + 15 = 111 ir::Expr e4 = (16 * i + j) / 16; EXPECT_EQ(analyzer.LowerBound(e4), ir::Expr(0)); - EXPECT_EQ(analyzer.UpperBound(e4), ir::Expr(7)); + EXPECT_EQ(analyzer.UpperBound(e4), ir::Expr(6)); // 110 / 16 = 6 ir::Expr e5 = (16 * i + j) % 16; EXPECT_EQ(analyzer.LowerBound(e5), ir::Expr(0)); - EXPECT_EQ(analyzer.UpperBound(e5), ir::Expr(15)); + EXPECT_EQ(analyzer.UpperBound(e5), ir::Expr(14)); // 110 % 16 ir::Expr e6 = i - j; - EXPECT_EQ(analyzer.LowerBound(e6), ir::Expr(-15)); - EXPECT_EQ(analyzer.UpperBound(e6), ir::Expr(7)); + EXPECT_EQ(analyzer.LowerBound(e6), ir::Expr(-14)); // 0 - 14 + EXPECT_EQ(analyzer.UpperBound(e6), ir::Expr(6)); // 6 - 0 ir::Expr e7 = 0 - i - j; - EXPECT_EQ(analyzer.LowerBound(e7), ir::Expr(-22)); - EXPECT_EQ(analyzer.UpperBound(e7), ir::Expr(0)); + EXPECT_EQ(analyzer.LowerBound(e7), ir::Expr(-20)); // 0 - 6 - 14 + EXPECT_EQ(analyzer.UpperBound(e7), ir::Expr(0)); // 0 - 0 - 0 ir::Expr e8 = -1 * i - j; - EXPECT_EQ(analyzer.LowerBound(e8), ir::Expr(-22)); - EXPECT_EQ(analyzer.UpperBound(e8), ir::Expr(0)); + EXPECT_EQ(analyzer.LowerBound(e8), ir::Expr(-20)); // -1 * 6 - 14 + EXPECT_EQ(analyzer.UpperBound(e8), ir::Expr(0)); // -1 * 0 - 0 } TEST_F(TestSymbolicExprAnalyzer, compare) { @@ -142,9 +144,9 @@ TEST_F(TestSymbolicExprAnalyzer, Divisible) { auto S = ir::Var(ir::Expr(16), ir::Expr(256), "S"); cas_intervals_t divisible_var_intervals = { - {"x", CasInterval(x->lower_bound, x->upper_bound)}, - {"y", CasInterval(y->lower_bound, y->upper_bound)}, - {"S", CasInterval(S->lower_bound, S->upper_bound)}, + {"x", CasInterval(x->lower_bound, x->upper_bound - ir::Expr(1))}, + {"y", CasInterval(y->lower_bound, y->upper_bound - ir::Expr(1))}, + {"S", CasInterval(S->lower_bound, S->upper_bound - ir::Expr(1))}, }; 
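The `- 1` / `- ir::Expr(1)` adjustments running through these tests all encode one convention: `ir::Var` bounds are half-open, `[lower_bound, upper_bound)`, while `CasInterval` (and hence `NormalizeUpperBound`) is closed, `[lower, upper]`, so the inclusive maximum of a Var is `upper_bound - 1`. The updated expected values then follow by plain interval arithmetic; a quick check in Python:

```python
# i in [0, 7) and j in [0, 15)  ->  inclusive maxima 6 and 14
i_max, j_max = 7 - 1, 15 - 1

assert i_max + j_max == 20              # UpperBound(i + j)
assert 16 * i_max + j_max == 110        # UpperBound(16 * i + j)
assert 16 * i_max + j_max + 1 == 111    # UpperBound(16 * i + j + 1)
assert (16 * i_max + j_max) // 16 == 6  # UpperBound((16 * i + j) / 16)
assert -(i_max + j_max) == -20          # LowerBound(0 - i - j)
```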
SymbolicExprAnalyzer divisible_analyzer{divisible_var_intervals}; @@ -323,11 +325,11 @@ TEST(SingleIntervalIntSet, case_1) { } TEST(SingleIntervalIntSet, case_2) { - ir::Var S = ir::Var(ir::Expr(0), ir::Expr(0), "S"); + ir::Var S = ir::Var(ir::Expr(0), ir::Expr(1), "S"); // S ∈ [0, 1) - SingleIntervalIntSet set_0{S, S + Expr(1)}; - SingleIntervalIntSet set_1{Expr(0), Expr(1)}; - SingleIntervalIntSet set_2{Expr(0), Expr(2)}; + SingleIntervalIntSet set_0{S, S + Expr(1)}; // [0, 1] + SingleIntervalIntSet set_1{Expr(0), Expr(1)}; // [0, 1] + SingleIntervalIntSet set_2{Expr(0), Expr(2)}; // [0, 2] EXPECT_TRUE(ProveEQ(set_0, set_1).value()); EXPECT_FALSE(ProveEQ(set_0, set_2).value()); diff --git a/test/cpp/eager/performance_tests/benchmark_utils.cc b/test/cpp/eager/performance_tests/benchmark_utils.cc index 7b95d911bc5345..23218075517c4c 100644 --- a/test/cpp/eager/performance_tests/benchmark_utils.cc +++ b/test/cpp/eager/performance_tests/benchmark_utils.cc @@ -228,7 +228,7 @@ void benchmark_fluid_scale(const std::shared_ptr& X, imperative::Tracer tracer; framework::AttributeMap attrs; - attrs["use_mkldnn"] = false; + attrs["use_onednn"] = false; attrs["scale"] = 2; attrs["bias"] = 3; attrs["bias_after_scale"] = true; diff --git a/test/cpp/fluid/elementwise/test_elementwise_add_grad_grad.cc b/test/cpp/fluid/elementwise/test_elementwise_add_grad_grad.cc index 6186cfa2c9756f..ec00557d6a0dd5 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_add_grad_grad.cc +++ b/test/cpp/fluid/elementwise/test_elementwise_add_grad_grad.cc @@ -59,7 +59,7 @@ class TestElementwiseAddGradGradWithoutDDX this->op_type_, {{"Y", {"Y"}}, {"DOut", {"DOut"}}, {"DDY", {"DDY"}}}, {{"DDOut", {"DDOut"}}}, - {{"use_mkldnn", false}, {"axis", 0}}); + {{"use_onednn", false}, {"axis", 0}}); return op; } }; diff --git a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc index 8f1ed87888ba44..f4ecb943a8dd9c 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc +++ b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc @@ -87,7 +87,7 @@ class TestElementwiseDivGradGradWithDout : public TestElementwiseOpGradGrad { {"DDY", {"DDY"}}, {"DX", {"DX"}}}, {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}, {"DOut", {"DOut"}}}, - {{"use_mkldnn", false}, {"axis", 0}}); + {{"use_onednn", false}, {"axis", 0}}); return op; } }; diff --git a/test/cpp/fluid/mkldnn/test_conv_onednn_nhwc.cc b/test/cpp/fluid/mkldnn/test_conv_onednn_nhwc.cc index 28028858c3bac0..49071d5938a744 100644 --- a/test/cpp/fluid/mkldnn/test_conv_onednn_nhwc.cc +++ b/test/cpp/fluid/mkldnn/test_conv_onednn_nhwc.cc @@ -66,7 +66,7 @@ TEST(test_conv2d_output, fp32) { conv2d_op.SetAttr("paddings", paddings); conv2d_op.SetAttr("dilations", dilations); conv2d_op.SetAttr("groups", groups); - conv2d_op.SetAttr("use_mkldnn", true); + conv2d_op.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(conv2d_op); @@ -95,7 +95,7 @@ TEST(test_conv2d_output, int8) { conv2d_op.SetAttr("paddings", paddings); conv2d_op.SetAttr("dilations", dilations); conv2d_op.SetAttr("groups", groups); - conv2d_op.SetAttr("use_mkldnn", true); + conv2d_op.SetAttr("use_onednn", true); conv2d_op.SetAttr("mkldnn_data_type", std::string("int8")); conv2d_op.SetAttr("force_fp32_output", false); @@ -126,7 +126,7 @@ TEST(test_conv2d_output, ic1) { conv2d_op.SetAttr("paddings", paddings); conv2d_op.SetAttr("dilations", dilations); conv2d_op.SetAttr("groups", groups); - 
conv2d_op.SetAttr("use_mkldnn", true); + conv2d_op.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(conv2d_op); @@ -156,7 +156,7 @@ TEST(test_conv2d_output, ic2) { conv2d_op.SetAttr("paddings", paddings); conv2d_op.SetAttr("dilations", dilations); conv2d_op.SetAttr("groups", groups); - conv2d_op.SetAttr("use_mkldnn", true); + conv2d_op.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(conv2d_op); @@ -186,7 +186,7 @@ TEST(test_conv2d_output, ic4) { conv2d_op.SetAttr("paddings", paddings); conv2d_op.SetAttr("dilations", dilations); conv2d_op.SetAttr("groups", groups); - conv2d_op.SetAttr("use_mkldnn", true); + conv2d_op.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(conv2d_op); diff --git a/test/cpp/fluid/mkldnn/test_onednn_caching.cc b/test/cpp/fluid/mkldnn/test_onednn_caching.cc index 694d9aeb6e3bc7..d87e1c4145f5b2 100644 --- a/test/cpp/fluid/mkldnn/test_onednn_caching.cc +++ b/test/cpp/fluid/mkldnn/test_onednn_caching.cc @@ -115,12 +115,12 @@ void RunOperator(const phi::Place &place, {{first_input_var_name, {first_input}}, {second_input_var_name, {"x1"}}}, {{output_var_name, {output_name}}}, - {{"use_mkldnn", {true}}}) + {{"use_onednn", {true}}}) : framework::OpRegistry::CreateOp( op_type, {{first_input_var_name, {first_input}}}, {{output_var_name, {output_name}}}, - {{"use_mkldnn", {true}}}); + {{"use_onednn", {true}}}); op->Run(scope, place); pool.Get(place)->Wait(); diff --git a/test/cpp/fluid/mkldnn/test_onednn_conv2d_transpose_bias.cc b/test/cpp/fluid/mkldnn/test_onednn_conv2d_transpose_bias.cc index 87311b8e9a2acd..6e5218c157f41e 100644 --- a/test/cpp/fluid/mkldnn/test_onednn_conv2d_transpose_bias.cc +++ b/test/cpp/fluid/mkldnn/test_onednn_conv2d_transpose_bias.cc @@ -61,7 +61,7 @@ void test_conv2d_transpose_bias() { AddVarToScope("convtranspose-Bias", &scope, {256}); AddVarToScope("convtranspose-Out", &scope, {1, 256, 27, 23}); - desc.SetAttr("use_mkldnn", true); + desc.SetAttr("use_onednn", true); desc.SetAttr("is_test", true); auto op = paddle::framework::OpRegistry::CreateOp(desc); diff --git a/test/cpp/fluid/mkldnn/test_onednn_op_inplace.cc b/test/cpp/fluid/mkldnn/test_onednn_op_inplace.cc index 54ff2aa51bb8e4..90e296790107e2 100644 --- a/test/cpp/fluid/mkldnn/test_onednn_op_inplace.cc +++ b/test/cpp/fluid/mkldnn/test_onednn_op_inplace.cc @@ -87,11 +87,11 @@ bool TestMain(const phi::Place &place, ? framework::OpRegistry::CreateOp(op_type, {{"X", {"x"}}, {"Y", {"x1"}}}, {{"Out", {"y"}}}, - {{"use_mkldnn", {true}}}) + {{"use_onednn", {true}}}) : framework::OpRegistry::CreateOp(op_type, {{"X", {"x"}}}, {{"Out", {"y"}}}, - {{"use_mkldnn", {true}}}); + {{"use_onednn", {true}}}); op_ref->Run(scope, place); pool.Get(place)->Wait(); @@ -104,11 +104,11 @@ bool TestMain(const phi::Place &place, ? 
framework::OpRegistry::CreateOp(op_type, {{"X", {"x"}}, {"Y", {"x1"}}}, {{"Out", {"x"}}}, - {{"use_mkldnn", {true}}}) + {{"use_onednn", {true}}}) : framework::OpRegistry::CreateOp(op_type, {{"X", {"x"}}}, {{"Out", {"x"}}}, - {{"use_mkldnn", {true}}}); + {{"use_onednn", {true}}}); op->Run(scope, place); phi::DeviceContextPool::Instance().Get(place)->Wait(); diff --git a/test/cpp/fluid/mkldnn/test_onednn_op_nhwc.cc b/test/cpp/fluid/mkldnn/test_onednn_op_nhwc.cc index f946a0aee1f49c..fc3073f1440759 100644 --- a/test/cpp/fluid/mkldnn/test_onednn_op_nhwc.cc +++ b/test/cpp/fluid/mkldnn/test_onednn_op_nhwc.cc @@ -67,7 +67,7 @@ void Test_Pool2d_Transpose_NHWC(const std::string &transpose_type) { {{"pooling_type", {std::string("max")}}, {"ksize", {ksize}}, {"data_format", {std::string("NHWC")}}, - {"use_mkldnn", {true}}}); + {"use_onednn", {true}}}); auto axis = std::vector(4, 0); axis[1] = 2; @@ -77,7 +77,7 @@ void Test_Pool2d_Transpose_NHWC(const std::string &transpose_type) { transpose_type, {{"X", {"y"}}}, {{"Out", {"z"}}}, - {{"axis", {axis}}, {"use_mkldnn", {true}}}); + {{"axis", {axis}}, {"use_onednn", {true}}}); op_pool->Run(scope, p); op_transpose->Run(scope, p); @@ -130,7 +130,7 @@ TEST(test_pool2d_relu_relu_nhwc, cpu_place) { {{"pooling_type", {std::string("max")}}, {"ksize", {ksize}}, {"data_format", {std::string("NHWC")}}, - {"use_mkldnn", {true}}}); + {"use_onednn", {true}}}); auto axis = std::vector(4, 0); axis[1] = 2; @@ -140,10 +140,10 @@ TEST(test_pool2d_relu_relu_nhwc, cpu_place) { "relu", {{"X", {"y"}}}, {{"Out", {"u"}}}, - {{"axis", {axis}}, {"use_mkldnn", {false}}}); + {{"axis", {axis}}, {"use_onednn", {false}}}); auto op_relu2 = framework::OpRegistry::CreateOp( - "relu", {{"X", {"u"}}}, {{"Out", {"z"}}}, {{"use_mkldnn", {true}}}); + "relu", {{"X", {"u"}}}, {{"Out", {"z"}}}, {{"use_onednn", {true}}}); op_pool->Run(scope, p); op_relu1->Run(scope, p); @@ -192,10 +192,10 @@ TEST(test_pool2d_shape_nhwc, cpu_place) { {{"pooling_type", {std::string("max")}}, {"ksize", {ksize}}, {"data_format", {std::string("NHWC")}}, - {"use_mkldnn", {true}}}); + {"use_onednn", {true}}}); auto op_shape = framework::OpRegistry::CreateOp( - "shape", {{"Input", {"y"}}}, {{"Out", {"z"}}}, {{"use_mkldnn", {true}}}); + "shape", {{"Input", {"y"}}}, {{"Out", {"z"}}}, {{"use_onednn", {true}}}); op_pool->Run(scope, p); op_shape->Run(scope, p); diff --git a/test/cpp/fluid/mkldnn/test_onednn_pool_adaptive_op.cc b/test/cpp/fluid/mkldnn/test_onednn_pool_adaptive_op.cc index 684ad2f1cc3775..1e45aad938ca8d 100644 --- a/test/cpp/fluid/mkldnn/test_onednn_pool_adaptive_op.cc +++ b/test/cpp/fluid/mkldnn/test_onednn_pool_adaptive_op.cc @@ -67,7 +67,7 @@ void test_pool2d(bool adaptive, bool ceil_mode, std::string pool_type = "max") { desc.SetAttr("paddings", paddings); desc.SetAttr("adaptive", adaptive); desc.SetAttr("ceil_mode", ceil_mode); - desc.SetAttr("use_mkldnn", true); + desc.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(desc); diff --git a/test/cpp/fluid/mkldnn/test_onednn_squeeze.cc b/test/cpp/fluid/mkldnn/test_onednn_squeeze.cc index 0a5b253e05bcab..b1dfd5ab5d1b79 100644 --- a/test/cpp/fluid/mkldnn/test_onednn_squeeze.cc +++ b/test/cpp/fluid/mkldnn/test_onednn_squeeze.cc @@ -62,7 +62,7 @@ void test_squeeze() { std::vector axes({-2}); desc.SetAttr("axes", axes); - desc.SetAttr("use_mkldnn", true); + desc.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(desc); @@ -86,7 +86,7 @@ void test_squeeze2() { std::vector axes({-1}); desc.SetAttr("axes", 
axes); - desc.SetAttr("use_mkldnn", true); + desc.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(desc); diff --git a/test/cpp/fluid/op_debug_string_test.cc b/test/cpp/fluid/op_debug_string_test.cc index 5195a53f5826cf..8d797f97e02f47 100644 --- a/test/cpp/fluid/op_debug_string_test.cc +++ b/test/cpp/fluid/op_debug_string_test.cc @@ -37,7 +37,7 @@ TEST(op_debug_str, test_unknown_dtype) { desc.SetOutput(framework::GradVarName("X"), {framework::GradVarName("X")}); desc.SetOutput(framework::GradVarName("Y"), {framework::GradVarName("Y")}); desc.SetAttr("axis", -1); - desc.SetAttr("use_mkldnn", false); + desc.SetAttr("use_onednn", false); auto x_tensor = scope.Var("X")->GetMutable(); x_tensor->Resize(dim); diff --git a/test/cpp/imperative/test_hooks.cc b/test/cpp/imperative/test_hooks.cc index 521e505b98b894..1350bd31539fd9 100644 --- a/test/cpp/imperative/test_hooks.cc +++ b/test/cpp/imperative/test_hooks.cc @@ -104,7 +104,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { NameVarBaseMap ins = {x_pair, y_pair}; NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; // add VariableWrapper hook x->GradVarBase()->AddVariableWrapperHook( @@ -211,7 +211,7 @@ void GradVarLeafBackwardHookWithGradAccumulatedTest() { NameVarBaseMap ins = {x_pair, y_pair}; NameVarBaseMap outs = {out_xy_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); var_pair z_pair = var_pair("Y", vb_vector(1, z)); diff --git a/test/cpp/imperative/test_prepare_op.cc b/test/cpp/imperative/test_prepare_op.cc index 1393683e44100e..ae1030abac0e27 100644 --- a/test/cpp/imperative/test_prepare_op.cc +++ b/test/cpp/imperative/test_prepare_op.cc @@ -247,7 +247,7 @@ TEST(test_prepare_op, test_complex_eager) { #ifdef PADDLE_WITH_DNNL TEST(test_prepare_op, test_prepare_data_cpu_onednn) { - TestPrepareDataSamePlace({{"use_mkldnn", true}}); + TestPrepareDataSamePlace({{"use_onednn", true}}); } #endif } // namespace imperative diff --git a/test/cpp/imperative/test_tracer.cc b/test/cpp/imperative/test_tracer.cc index 305334c6a92bb7..ecca7eb41eb441 100644 --- a/test/cpp/imperative/test_tracer.cc +++ b/test/cpp/imperative/test_tracer.cc @@ -89,7 +89,7 @@ TEST(test_tracer, test_trace_op) { imperative::NameVarBaseMap ins = {x_pair, y_pair}; imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); #ifndef PADDLE_WITH_XPU @@ -141,7 +141,7 @@ TEST(test_tracer, test_trace_op_with_backward) { imperative::NameVarBaseMap ins = {x_pair, y_pair}; imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); const auto& out_tensor = vout->Var().Get(); for (int i = 0; i < vout->Var().Get().numel(); i++) { @@ -187,7 +187,7 @@ TEST(test_tracer, test_track_backward_output) { imperative::NameVarBaseMap ins = {x_pair, y_pair}; imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); ASSERT_EQ(x_in->GradVarBase()->GradOpNum(), 0UL); 
ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); @@ -232,7 +232,7 @@ TEST(test_tracer, test_track_backward_input) { imperative::NameVarBaseMap ins = {x_pair, y_pair}; imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); ASSERT_EQ(x_in->GradVarBase()->GradOpNum(), 0UL); @@ -280,7 +280,7 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { imperative::NameVarBaseMap ins = {x_pair, y_pair}; imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp( "elementwise_add", ins, outs, mul_attr_map, gpu_place, true); @@ -417,7 +417,7 @@ TEST(test_tracer, test_var_without_grad_var) { imperative::NameVarBaseMap ins = {x_pair, y_pair}; imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); const auto& out_tensor = vout->Var().Get(); @@ -636,7 +636,7 @@ TEST(test_tracer, eager_tracer) { imperative::NameTensorMap ins = {x_pair, y_pair}; imperative::NameTensorMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp( "mul", ins, outs, mul_attr_map, place, true); diff --git a/test/cpp/pir/cinn/CMakeLists.txt b/test/cpp/pir/cinn/CMakeLists.txt index 0b3591f64f0f2c..2871d040551ee2 100644 --- a/test/cpp/pir/cinn/CMakeLists.txt +++ b/test/cpp/pir/cinn/CMakeLists.txt @@ -47,6 +47,10 @@ if(WITH_TESTING AND WITH_CINN) paddle_test(eliminate_common_factor_of_local_index_test SRCS eliminate_common_factor_of_local_index_test.cc) + paddle_test(ir_simplify_select_test SRCS ir_simplify_select_test.cc) + + paddle_test(ir_simplify_bound_test SRCS ir_simplify_bound_test.cc) + # DO NOT forget add test name here, otherwise it will not be executed in # CINN CI. 
set(cinn_unit_tests diff --git a/test/cpp/pir/cinn/adt/index_expr_test.cc b/test/cpp/pir/cinn/adt/index_expr_test.cc index 3bc2f4ab4e7ae3..a38041f669b20b 100644 --- a/test/cpp/pir/cinn/adt/index_expr_test.cc +++ b/test/cpp/pir/cinn/adt/index_expr_test.cc @@ -52,6 +52,7 @@ class TestIndexExpr : public ::testing::Test { ir::Var S4, S5, S6, S7, S8, S9, f; }; + TEST_F(TestIndexExpr, IndexExpr_0) { ir::IndexExpr a(14); ir::IndexExpr b(7); @@ -643,10 +644,11 @@ TEST_F(TestIndexExpr, MatchPattern) { EXPECT_EQ(result9->at("x"), x); EXPECT_EQ(result9->at("y"), y); } + TEST_F(TestIndexExpr, BoundSimplify) { ir::Var S0 = ir::Var("S0"); - ir::Var i = ir::Var(ir::Expr(0), ir::Expr(5), "i"); - ir::Var j = ir::Var(ir::Expr(0), S0, "j"); + ir::Var i = ir::Var(ir::Expr(0), ir::Expr(5), "i"); // i ∈ [0, 5) + ir::Var j = ir::Var(ir::Expr(0), S0, "j"); // j ∈ [0, S0) ir::Expr q0 = i / Expr(5); ir::Expr q1 = i / Expr(4); diff --git a/test/cpp/pir/cinn/adt/iter_simplify_test.cc b/test/cpp/pir/cinn/adt/iter_simplify_test.cc index 248855b703ff3b..b09bc9d6f521c7 100644 --- a/test/cpp/pir/cinn/adt/iter_simplify_test.cc +++ b/test/cpp/pir/cinn/adt/iter_simplify_test.cc @@ -47,11 +47,12 @@ class TestIterSimplify : public ::testing::Test { i_j_k_fused = ir::Var(ir::Expr(0), ir::Expr(64), "i_j_k_fused").set_index(1); var_intervals = { - {"i", CasInterval(i->lower_bound, i->upper_bound)}, - {"j", CasInterval(j->lower_bound, j->upper_bound)}, - {"k", CasInterval(k->lower_bound, k->upper_bound)}, + {"i", CasInterval(i->lower_bound, i->upper_bound - ir::Expr(1))}, + {"j", CasInterval(j->lower_bound, j->upper_bound - ir::Expr(1))}, + {"k", CasInterval(k->lower_bound, k->upper_bound - ir::Expr(1))}, {"i_j_k_fused", - CasInterval(i_j_k_fused->lower_bound, i_j_k_fused->upper_bound)}}; + CasInterval(i_j_k_fused->lower_bound, + i_j_k_fused->upper_bound - ir::Expr(1))}}; }; ir::Var i; diff --git a/test/cpp/pir/cinn/ir_simplify_bound_test.cc b/test/cpp/pir/cinn/ir_simplify_bound_test.cc new file mode 100644 index 00000000000000..42206af0b9d9b7 --- /dev/null +++ b/test/cpp/pir/cinn/ir_simplify_bound_test.cc @@ -0,0 +1,191 @@ +// Copyright (c) 2025 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
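The bound-driven simplifications asserted in the new `ir_simplify_bound_test.cc` below can be sanity-checked numerically: each identity must hold for every value the loop variables can take within their half-open ranges. A brute-force check in Python (sampling the large `i_j_fused` range for speed):

```python
for i_j_fused in range(0, 524288, 4099):     # sample of [0, 524288)
    for j_0 in range(128):                   # all of [0, 128)
        assert j_0 % 128 == j_0              # SimplifyMod
        assert j_0 // 128 == 0               # SimplifyDiv
        m = i_j_fused % 16
        assert (m * 128 + j_0) // 128 == m   # SimplifyLinearDiv
        assert (m * 128 + j_0) % 128 == j_0  # SimplifyLinearMod

for loop_var_2 in range(32):                 # [0, 32)
    for loop_var_3 in range(4):              # [0, 4)
        # max of loop_var_3 * 32 + loop_var_2 is 3 * 32 + 31 = 127 < 128
        assert (loop_var_3 * 32 + loop_var_2) // 128 == 0  # SimplifyLinearDiv2
```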
+ +#include "paddle/cinn/optim/ir_simplify.h" + +#include + +#include "paddle/cinn/cinn.h" +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir_base.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_nodes_collector.h" +#include "paddle/cinn/ir/utils/stmt_converter.h" +#include "paddle/cinn/utils/string.h" + +namespace cinn { +namespace optim { + +/* +i_j_fused: [0ll, 524288ll) +j_0: [0, 128) +Before Normalize: +(j_0 % 128) +After Normalize: +j_0 +*/ +TEST(IRSimplifyBound, SimplifyMod) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + // Define loop variable + ir::Var var_j_0 = ir::Var(ir::Expr(0), ir::Expr(128), "j_0"); + + // Final expression + ir::Expr expr = ir::Mod::Make(var_j_0, ir::Expr(128)); + + VLOG(6) << "Before Simplify: " << expr; + auto res = expr.as_index().ir::IndexExpr::Normalize( + ir::IndexExpr::OptLevel::kLevel3); + VLOG(6) << "After Simplify: " << res; + + // Expected output verification + std::string expected_ir = R"ROC(j_0)ROC"; + + EXPECT_EQ(utils::GetStreamCnt(res), utils::Trim(expected_ir)); +} + +/* +i_j_fused: [0ll, 524288ll) +j_0: [0, 128) +Before Normalize: +(j_0 / 128) +After Normalize: +0 +*/ +TEST(IRSimplifyBound, SimplifyDiv) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + // Define loop variable + ir::Var var_j_0 = ir::Var(ir::Expr(0), ir::Expr(128), "j_0"); + + // Final expression + ir::Expr expr = ir::Div::Make(var_j_0, ir::Expr(128)); + + VLOG(6) << "Before Normalize: " << expr; + auto res = expr.as_index().ir::IndexExpr::Normalize( + ir::IndexExpr::OptLevel::kLevel3); + VLOG(6) << "After Normalize: " << res; + + // Expected output verification + std::string expected_ir = R"ROC(0)ROC"; + + EXPECT_EQ(utils::GetStreamCnt(res), utils::Trim(expected_ir)); +} + +/* +i_j_fused: [0ll, 524288ll) +j_0: [0, 128) +Before Normalize: +((((i_j_fused % 16) * 128) + j_0) / 128) +After Normalize: +(i_j_fused % 16) +*/ +TEST(IRSimplifyBound, SimplifyLinearDiv) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + // Define loop variables + ir::Var var_i_j_fused = ir::Var(ir::Expr(0), ir::Expr(524288), "i_j_fused"); + ir::Var var_j_0 = ir::Var(ir::Expr(0), ir::Expr(128), "j_0"); + + // Final expression + ir::Expr expr = ir::Div::Make( + ir::Add::Make(ir::Mul::Make(ir::Mod::Make(var_i_j_fused, ir::Expr(16)), + ir::Expr(128)), + var_j_0), + ir::Expr(128)); + + VLOG(6) << "Before Normalize: " << expr; + auto res = expr.as_index().ir::IndexExpr::Normalize( + ir::IndexExpr::OptLevel::kLevel3); + VLOG(6) << "After Normalize: " << res; + + // Expected output verification + std::string expected_ir = R"ROC((i_j_fused % 16))ROC"; + + EXPECT_EQ(utils::GetStreamCnt(res), utils::Trim(expected_ir)); +} + +/* +i_j_fused: [0ll, 524288ll) +j_0: [0, 128) +Before Normalize: +((((i_j_fused % 16) * 128) + j_0) % 128) +After Normalize: +j_0 +*/ +TEST(IRSimplifyBound, SimplifyLinearMod) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + // Define loop variables + ir::Var var_i_j_fused = ir::Var(ir::Expr(0), ir::Expr(524288), "i_j_fused"); + ir::Var var_j_0 = ir::Var(ir::Expr(0), ir::Expr(128), "j_0"); + + // Final expression + ir::Expr expr = ir::Mod::Make( + ir::Add::Make(ir::Mul::Make(ir::Mod::Make(var_i_j_fused, ir::Expr(16)), + ir::Expr(128)), + var_j_0), + ir::Expr(128)); + + VLOG(6) << "Before 
Normalize: " << expr; + auto res = expr.as_index().ir::IndexExpr::Normalize( + ir::IndexExpr::OptLevel::kLevel3); + VLOG(6) << "After Normalize: " << res; + + // Expected output verification + std::string expected_ir = R"ROC(j_0)ROC"; + + EXPECT_EQ(utils::GetStreamCnt(res), utils::Trim(expected_ir)); +} + +/* +loop_var_2: [0, 32) +loop_var_3: [0, 4) +Before Normalize: +(((loop_var_3 * 32ll) + loop_var_2) / 128ll) +After Normalize: +0 +*/ +TEST(IRSimplifyBound, SimplifyLinearDiv2) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + // Define loop variables + ir::Var loop_var_2 = ir::Var(ir::Expr(0), ir::Expr(32), "loop_var_2"); + ir::Var loop_var_3 = ir::Var(ir::Expr(0), ir::Expr(4), "loop_var_3"); + + // Final expression + ir::Expr expr = ir::Div::Make( + ir::Add::Make(ir::Mul::Make(loop_var_3, ir::Expr(32)), loop_var_2), + ir::Expr(128)); + + VLOG(6) << "Before Normalize: " << expr; + auto res = expr.as_index().ir::IndexExpr::Normalize( + ir::IndexExpr::OptLevel::kLevel3); + VLOG(6) << "After Normalize: " << res; + + // Expected output verification + std::string expected_ir = R"ROC(0)ROC"; + + EXPECT_EQ(utils::GetStreamCnt(res), utils::Trim(expected_ir)); +} + +} // namespace optim +} // namespace cinn diff --git a/test/cpp/pir/cinn/ir_simplify_select_test.cc b/test/cpp/pir/cinn/ir_simplify_select_test.cc new file mode 100644 index 00000000000000..0f236e9d266865 --- /dev/null +++ b/test/cpp/pir/cinn/ir_simplify_select_test.cc @@ -0,0 +1,336 @@ +// Copyright (c) 2025 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/cinn/optim/ir_simplify.h" + +#include + +#include "paddle/cinn/cinn.h" +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_nodes_collector.h" +#include "paddle/cinn/ir/utils/stmt_converter.h" +#include "paddle/cinn/utils/string.h" + +namespace cinn { +namespace optim { + +/* +serial for (i, 0ll, 32768ll) { + serial for (j, 0ll, 16ll) { + serial for (reduce_k_0, 0ll, 128ll) { + var_18[i, j] = select((var_18[i, j] > var_17[i, j, reduce_k_0]), +var_18[i, j], var_17[i, j, reduce_k_0]) + } + } + } +} +*/ +TEST(IRSimplifySelect, SimplifySelectToMax) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + const std::vector shape_2d = {ir::Expr(32768), ir::Expr(16)}; + const std::vector shape_3d = { + ir::Expr(32768), ir::Expr(16), ir::Expr(128)}; + + ir::Tensor var_17 = + ir::_Tensor_::Make("var_17", ir::Float(32), shape_3d, shape_3d); + var_17->WithBuffer("global", "var_17_buffer"); + + ir::Tensor var_18 = + ir::_Tensor_::Make("var_18", ir::Float(32), shape_2d, shape_2d); + var_18->WithBuffer("global", "var_18_buffer"); + + // Define loop variables + ir::Var var_i = ir::Var(ir::Expr(0), ir::Expr(32768), "i"); + ir::Var var_j = ir::Var(ir::Expr(0), ir::Expr(16), "j"); + ir::Var var_reduce_k_0 = ir::Var(ir::Expr(0), ir::Expr(128), "reduce_k_0"); + + // Create innermost reduction loop body + ir::Expr reduce_body = ir::Store::Make( + var_18, + ir::Select::Make( + ir::GT::Make(ir::Load::Make(var_18, {var_i, var_j}), + ir::Load::Make(var_17, {var_i, var_j, var_reduce_k_0})), + ir::Load::Make(var_18, {var_i, var_j}), + ir::Load::Make(var_17, {var_i, var_j, var_reduce_k_0})), + {var_i, var_j}); + + // Create reduction loop + ir::Expr reduce_loop = ir::For::Make(var_reduce_k_0, + ir::Expr(0), + ir::Expr(128), + ir::ForType::Serial, + ir::DeviceAPI::Host, + ir::Block::Make({reduce_body})); + + // Create j loop + ir::Expr j_loop = ir::For::Make(var_j, + ir::Expr(0), + ir::Expr(16), + ir::ForType::Serial, + ir::DeviceAPI::Host, + ir::Block::Make({reduce_loop})); + + // Create i loop + ir::Expr i_loop = ir::For::Make(var_i, + ir::Expr(0), + ir::Expr(32768), + ir::ForType::Serial, + ir::DeviceAPI::Host, + ir::Block::Make({j_loop})); + + // Final expression + ir::Expr expr = ir::Block::Make({i_loop}); + + VLOG(6) << "Before Simplify: " << expr; + Simplify(&expr); + VLOG(6) << "After Simplify: " << expr; + + // Expected output verification + std::string expected_ir = R"ROC({ + serial for (i, 0, 32768) + { + serial for (j, 0, 16) + { + serial for (reduce_k_0, 0, 128) + { + var_18[i, j] = cinn_max(var_17[i, j, reduce_k_0], var_18[i, j]) + } + } + } +})ROC"; + + EXPECT_EQ(utils::GetStreamCnt(expr), utils::Trim(expected_ir)); +} + +/* +serial for (i, 0ll, 32768ll) { + serial for (j, 0ll, 16ll) { + serial for (reduce_k_0, 0ll, 128ll) { + var_18[i, j] = select((var_18[i, j] < var_17[i, j, reduce_k_0]), +var_18[i, j], var_17[i, j, reduce_k_0]) + } + } + } +} +*/ +TEST(IRSimplifySelect, SimplifySelectToMin) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + const std::vector shape_2d = {ir::Expr(32768), ir::Expr(16)}; + const std::vector shape_3d = { + ir::Expr(32768), ir::Expr(16), ir::Expr(128)}; + + ir::Tensor var_17 = + ir::_Tensor_::Make("var_17", ir::Float(32), shape_3d, shape_3d); + var_17->WithBuffer("global", "var_17_buffer"); + + ir::Tensor var_18 = + 
+
+/*
+serial for (i, 0ll, 32768ll) {
+  serial for (j, 0ll, 16ll) {
+    serial for (reduce_k_0, 0ll, 128ll) {
+      var_18[i, j] = select((var_18[i, j] < var_17[i, j, reduce_k_0]),
+var_18[i, j], var_17[i, j, reduce_k_0])
+    }
+  }
+}
+*/
+TEST(IRSimplifySelect, SimplifySelectToMin) {
+  Context::Global().ResetNameId();
+
+  // Create input IR matching the specified pattern
+  const std::vector<ir::Expr> shape_2d = {ir::Expr(32768), ir::Expr(16)};
+  const std::vector<ir::Expr> shape_3d = {
+      ir::Expr(32768), ir::Expr(16), ir::Expr(128)};
+
+  ir::Tensor var_17 =
+      ir::_Tensor_::Make("var_17", ir::Float(32), shape_3d, shape_3d);
+  var_17->WithBuffer("global", "var_17_buffer");
+
+  ir::Tensor var_18 =
+      ir::_Tensor_::Make("var_18", ir::Float(32), shape_2d, shape_2d);
+  var_18->WithBuffer("global", "var_18_buffer");
+
+  // Define loop variables
+  ir::Var var_i = ir::Var(ir::Expr(0), ir::Expr(32768), "i");
+  ir::Var var_j = ir::Var(ir::Expr(0), ir::Expr(16), "j");
+  ir::Var var_reduce_k_0 = ir::Var(ir::Expr(0), ir::Expr(128), "reduce_k_0");
+
+  // Create innermost reduction loop body
+  ir::Expr reduce_body = ir::Store::Make(
+      var_18,
+      ir::Select::Make(
+          ir::LT::Make(ir::Load::Make(var_18, {var_i, var_j}),
+                       ir::Load::Make(var_17, {var_i, var_j, var_reduce_k_0})),
+          ir::Load::Make(var_18, {var_i, var_j}),
+          ir::Load::Make(var_17, {var_i, var_j, var_reduce_k_0})),
+      {var_i, var_j});
+
+  // Create reduction loop
+  ir::Expr reduce_loop = ir::For::Make(var_reduce_k_0,
+                                       ir::Expr(0),
+                                       ir::Expr(128),
+                                       ir::ForType::Serial,
+                                       ir::DeviceAPI::Host,
+                                       ir::Block::Make({reduce_body}));
+
+  // Create j loop
+  ir::Expr j_loop = ir::For::Make(var_j,
+                                  ir::Expr(0),
+                                  ir::Expr(16),
+                                  ir::ForType::Serial,
+                                  ir::DeviceAPI::Host,
+                                  ir::Block::Make({reduce_loop}));
+
+  // Create i loop
+  ir::Expr i_loop = ir::For::Make(var_i,
+                                  ir::Expr(0),
+                                  ir::Expr(32768),
+                                  ir::ForType::Serial,
+                                  ir::DeviceAPI::Host,
+                                  ir::Block::Make({j_loop}));
+
+  // Final expression
+  ir::Expr expr = ir::Block::Make({i_loop});
+
+  VLOG(6) << "Before Simplify: " << expr;
+  Simplify(&expr);
+  VLOG(6) << "After Simplify: " << expr;
+
+  // Expected output verification
+  std::string expected_ir = R"ROC({
+  serial for (i, 0, 32768)
+  {
+    serial for (j, 0, 16)
+    {
+      serial for (reduce_k_0, 0, 128)
+      {
+        var_18[i, j] = cinn_min(var_18[i, j], var_17[i, j, reduce_k_0])
+      }
+    }
+  }
+})ROC";
+
+  EXPECT_EQ(utils::GetStreamCnt(expr), utils::Trim(expected_ir));
+}
+
+/*
+serial for (i, 0ll, 32768ll)
+{
+  serial for (j, 0, 16)
+  {
+    serial for (j_0, 0, 128)
+    {
+      var_45[i, j, j_0] = select(
+          (var_18[i, ((((j * 128ll) + j_0) / 128ll) + 0ll)] <=
+           float32(3.4028234663852886e+38)),
+          select(
+              (var_18[i, ((((j * 128ll) + j_0) / 128ll) + 0ll)] >=
+               float32(9.9999997473787516e-05)),
+              var_18[i, ((((j * 128ll) + j_0) / 128ll) + 0ll)],
+              float32(9.9999997473787516e-05)),
+          float32(3.4028234663852886e+38))
+    }
+  }
+}
+*/
+TEST(IRSimplifySelect, SimplifySelectToMinMax) {
+  Context::Global().ResetNameId();
+
+  // Create input IR matching the specified pattern
+  const std::vector<ir::Expr> shape_2d = {ir::Expr(32768), ir::Expr(16)};
+  const std::vector<ir::Expr> shape_3d = {
+      ir::Expr(32768), ir::Expr(16), ir::Expr(128)};
+
+  ir::Tensor var_18 =
+      ir::_Tensor_::Make("var_18", ir::Float(32), shape_2d, shape_2d);
+  var_18->WithBuffer("global", "var_18_buffer");
+
+  ir::Tensor var_45 =
+      ir::_Tensor_::Make("var_45", ir::Float(32), shape_3d, shape_3d);
+  var_45->WithBuffer("global", "var_45_buffer");
+
+  // Define loop variables
+  ir::Var var_i = ir::Var(ir::Expr(0), ir::Expr(32768), "i");
+  ir::Var var_j = ir::Var(ir::Expr(0), ir::Expr(16), "j");
+  ir::Var var_j_0 = ir::Var(ir::Expr(0), ir::Expr(128), "j_0");
+
+  // Create innermost loop body
+  ir::Expr body = ir::Store::Make(
+      var_45,
+      ir::Select::Make(
+          ir::LE::Make(
+              ir::Load::Make(
+                  var_18,
+                  {var_i,
+                   ir::Div::Make(
+                       ir::Add::Make(ir::Mul::Make(var_j, ir::Expr(128)),
+                                     var_j_0),
+                       ir::Expr(128))}),
+              ir::Expr(3.4028234663852886e+38f)),
+          ir::Select::Make(
+              ir::GE::Make(
+                  ir::Load::Make(
+                      var_18,
+                      {var_i,
+                       ir::Div::Make(
+                           ir::Add::Make(ir::Mul::Make(var_j, ir::Expr(128)),
+                                         var_j_0),
+                           ir::Expr(128))}),
+                  ir::Expr(9.9999997473787516e-05f)),
+              ir::Load::Make(
+                  var_18,
+                  {var_i,
+                   ir::Div::Make(
+                       ir::Add::Make(ir::Mul::Make(var_j, ir::Expr(128)),
+                                     var_j_0),
+                       ir::Expr(128))}),
+              ir::Expr(9.9999997473787516e-05f)),
+          ir::Expr(3.4028234663852886e+38f)),
+      {var_i, var_j, var_j_0});
+
+  // Create j_0 loop
+  ir::Expr j_0_loop = ir::For::Make(var_j_0,
+                                    ir::Expr(0),
+                                    ir::Expr(128),
+                                    ir::ForType::Serial,
+                                    ir::DeviceAPI::Host,
+                                    ir::Block::Make({body}));
+
+  // Create j loop
+  ir::Expr j_loop = ir::For::Make(var_j,
+                                  ir::Expr(0),
+                                  ir::Expr(16),
+                                  ir::ForType::Serial,
+                                  ir::DeviceAPI::Host,
+                                  ir::Block::Make({j_0_loop}));
+
+  // Create i loop
+  ir::Expr i_loop = ir::For::Make(var_i,
+                                  ir::Expr(0),
+                                  ir::Expr(32768),
+                                  ir::ForType::Serial,
+                                  ir::DeviceAPI::Host,
+                                  ir::Block::Make({j_loop}));
+
+  // Final expression
+  ir::Expr expr = ir::Block::Make({i_loop});
+
+  VLOG(6) << "Before Simplify: " << expr;
+  Simplify(&expr);
+  VLOG(6) << "After Simplify: " << expr;
+
+  // Expected output verification
+  std::string expected_ir = R"ROC({
+  serial for (i, 0, 32768)
+  {
+    serial for (j, 0, 16)
+    {
+      serial for (j_0, 0, 128)
+      {
+        var_45[i, j, j_0] = cinn_min(cinn_max(var_18[i, (((j * 128) + j_0) / 128)], 9.99999975e-05f), 3.40282347e+38f)
+      }
+    }
+  }
+})ROC";
+
+  EXPECT_EQ(utils::GetStreamCnt(expr), utils::Trim(expected_ir));
+}
+}  // namespace optim
+}  // namespace cinn
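Nesting the two rewrites yields a clamp: select(x <= hi, select(x >= lo, x, lo), hi) becomes cinn_min(cinn_max(x, lo), hi), assuming lo <= hi. A minimal numeric check of that equivalence, again outside CINN:

    #include <algorithm>
    #include <cassert>

    int main() {
      const float lo = 9.9999997473787516e-05f;  // the epsilon in the test
      const float hi = 3.4028234663852886e+38f;  // float max, as in the test
      for (float x : {-1.0f, 0.0f, 5e-05f, 1.0f, 1e+10f}) {
        float nested = (x <= hi) ? ((x >= lo) ? x : lo) : hi;  // input IR shape
        assert(nested == std::min(std::max(x, lo), hi));       // output IR shape
      }
      return 0;
    }

Note that the index (((j * 128) + j_0) / 128) in the expected output is left alone by this pass; folding it is the job of the bound-aware normalization tested earlier.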
diff --git a/test/cpp/pir/cinn/ir_simplify_test.cc b/test/cpp/pir/cinn/ir_simplify_test.cc
index e682079e72a90a..485216814f0102 100644
--- a/test/cpp/pir/cinn/ir_simplify_test.cc
+++ b/test/cpp/pir/cinn/ir_simplify_test.cc
@@ -479,5 +479,98 @@ TEST(IRSimplify, if_fold_EQ_2) {
   }
 )ROC"));
 }
+
+/*
+serial for (i_j_fused, 0ll, 524288ll)
+{
+  serial for (j_0, 0, 128)
+  {
+    var_45[(i_j_fused / 16), (((i_j_fused % 16) * 128) + j_0)] =
+        pow(2.0f, ceil(log2((0.00223214296f * var_31[0]))))
+  }
+}
+*/
+TEST(IRSimplifyPowerCeilLog2BitOpLdexpf, Base) {
+  Context::Global().ResetNameId();
+
+  // Create input IR matching the specified pattern
+  const std::vector<ir::Expr> shape_2d = {ir::Expr(32768), ir::Expr(16)};
+  const std::vector<ir::Expr> shape_3d = {ir::Expr(32768), ir::Expr(16)};
+
+  ir::Tensor var_31 =
+      ir::_Tensor_::Make("var_31", ir::Float(32), shape_2d, shape_2d);
+  var_31->WithBuffer("global", "var_31_buffer");
+
+  ir::Tensor var_45 =
+      ir::_Tensor_::Make("var_45", ir::Float(32), shape_3d, shape_3d);
+  var_45->WithBuffer("global", "var_45_buffer");
+
+  // Define loop variables
+  ir::Var var_i_j_fused = ir::Var(ir::Expr(0), ir::Expr(524288), "i_j_fused");
+  ir::Var var_j_0 = ir::Var(ir::Expr(0), ir::Expr(128), "j_0");
+
+  // Create innermost loop body
+  ir::Expr body = ir::Store::Make(
+      var_45,
+      ir::Call::Make(
+          ir::Float(32),  // Return type
+          "pow",          // Intrinsic function name
+          {ir::Expr(2.0f),
+           ir::Call::Make(
+               ir::Float(32),
+               "ceil",
+               {ir::Call::Make(
+                   ir::Float(32),
+                   "log2",
+                   {ir::Mul::Make(ir::Expr(0.00223214296f),
+                                  ir::Load::Make(var_31, {ir::Expr(0)}))},
+                   {},
+                   ir::CallType::Intrinsic)},
+               {},
+               ir::CallType::Intrinsic)},
+          {},
+          ir::CallType::Intrinsic),
+      {ir::Div::Make(var_i_j_fused, ir::Expr(16)),
+       ir::Add::Make(ir::Mul::Make(ir::Mod::Make(var_i_j_fused, ir::Expr(16)),
+                                   ir::Expr(128)),
+                     var_j_0)});
+
+  // Create j_0 loop
+  ir::Expr j_0_loop = ir::For::Make(var_j_0,
+                                    ir::Expr(0),
+                                    ir::Expr(128),
+                                    ir::ForType::Serial,
+                                    ir::DeviceAPI::Host,
+                                    ir::Block::Make({body}));
+
+  // Create i_j_fused loop
+  ir::Expr i_j_fused_loop = ir::For::Make(var_i_j_fused,
+                                          ir::Expr(0),
+                                          ir::Expr(524288),
+                                          ir::ForType::Serial,
+                                          ir::DeviceAPI::Host,
+                                          ir::Block::Make({j_0_loop}));
+
+  // Final expression
+  ir::Expr expr = ir::Block::Make({i_j_fused_loop});
+
+  VLOG(6) << "Before Simplify: " << expr;
+  cinn::optim::Simplify(&expr);
+  VLOG(6) << "After Simplify: " << expr;
+
+  // Expected output verification
+  std::string expected_ir = R"ROC({
+  serial for (i_j_fused, 0, 524288)
+  {
+    serial for (j_0, 0, 128)
+    {
+      var_45[(i_j_fused / 16), (((i_j_fused % 16) * 128) + j_0)] = ldexpf(1.00000000f, ((bitwise_and(right_shift(__float_as_uint((0.00223214296f * var_31[0])), 23), 255) - 127) + select((((bitwise_and(right_shift(__float_as_uint((0.00223214296f * var_31[0])), 23), 255) - 127) != -127) and (bitwise_and(__float_as_uint((0.00223214296f * var_31[0])), 8388607) != 0)), 1, 0)))
+    }
+  }
+})ROC";
+
+  EXPECT_EQ(utils::GetStreamCnt(expr), utils::Trim(expected_ir));
+}
+
 }  // namespace common
 }  // namespace cinn
diff --git a/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc b/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc
index 38978395b5ac7c..903cb9357cceea 100644
--- a/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc
+++ b/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc
@@ -317,7 +317,7 @@ TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
   config.SetModel(FLAGS_dirname);
   config.SwitchIrOptim(true);
   config.EnableUseGpu(100, 0);
-  config.EnableMkldnnBfloat16();
+  config.EnableOnednnBfloat16();
 #ifdef PADDLE_WITH_DNNL
   if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core))
     ASSERT_EQ(config.onednn_bfloat16_enabled(), true);
@@ -332,7 +332,7 @@ TEST(AnalysisPredictor, bf16_pass_strategy) {
   std::vector<std::string> passes;
   PassStrategy passStrategy(passes);
-  passStrategy.EnableMkldnnBfloat16();
+  passStrategy.EnableOnednnBfloat16();
 }
 
 TEST(AnalysisPredictor, onednn_fc_pass_strategy) {
diff --git a/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc b/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc
index ec10b780a35eeb..e30b8f364c7199 100644
--- a/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc
+++ b/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc
@@ -69,7 +69,7 @@ TEST(Analyzer_bert, compare) { CompareNativeAndAnalysisWrapper(); }
 #ifdef PADDLE_WITH_DNNL
-TEST(Analyzer_bert, compare_mkldnn) {
+TEST(Analyzer_bert, compare_onednn) {
   auto use_onednn = true;
   CompareNativeAndAnalysisWrapper(use_onednn);
 }
@@ -210,7 +210,7 @@ AnalysisConfig SetConfig(bool use_onednn, bool use_bfloat16) {
     config.EnableONEDNN();
   }
 
-  if (use_bfloat16) config.EnableMkldnnBfloat16();
+  if (use_bfloat16) config.EnableOnednnBfloat16();
 
   return config;
 }
diff --git a/test/deprecated/cpp/inference/api/analyzer_image_classification_tester.cc b/test/deprecated/cpp/inference/api/analyzer_image_classification_tester.cc
index 47c53d249e00c6..9915fac72873f3 100644
--- a/test/deprecated/cpp/inference/api/analyzer_image_classification_tester.cc
+++ b/test/deprecated/cpp/inference/api/analyzer_image_classification_tester.cc
@@ -37,11 +37,11 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 }
 
 // Easy for profiling independently.
-void profile(bool use_mkldnn = false) { +void profile(bool use_onednn = false) { AnalysisConfig cfg; SetConfig(&cfg); - if (use_mkldnn) { + if (use_onednn) { cfg.EnableONEDNN(); if (FLAGS_disable_onednn_fc) { cfg.DisableOnednnFcPasses(); @@ -59,14 +59,14 @@ void profile(bool use_mkldnn = false) { TEST(Analyzer_resnet50, profile) { profile(); } #ifdef PADDLE_WITH_DNNL -TEST(Analyzer_resnet50, profile_mkldnn) { profile(true /* use_mkldnn */); } +TEST(Analyzer_resnet50, profile_onednn) { profile(true /* use_onednn */); } #endif // Compare result of NativeConfig and AnalysisConfig -void compare(bool use_mkldnn = false) { +void compare(bool use_onednn = false) { AnalysisConfig cfg; SetConfig(&cfg); - if (use_mkldnn) { + if (use_onednn) { cfg.EnableONEDNN(); if (FLAGS_disable_onednn_fc) { cfg.DisableOnednnFcPasses(); @@ -81,7 +81,7 @@ void compare(bool use_mkldnn = false) { TEST(Analyzer_resnet50, compare) { compare(); } #ifdef PADDLE_WITH_DNNL -TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); } +TEST(Analyzer_resnet50, compare_onednn) { compare(true /* use_onednn */); } #endif // Compare Deterministic result diff --git a/test/deprecated/cpp/inference/api/analyzer_transformer_compare_tester.cc b/test/deprecated/cpp/inference/api/analyzer_transformer_compare_tester.cc index 04885a97ec19ba..a4dec2b4755eb5 100644 --- a/test/deprecated/cpp/inference/api/analyzer_transformer_compare_tester.cc +++ b/test/deprecated/cpp/inference/api/analyzer_transformer_compare_tester.cc @@ -19,10 +19,10 @@ namespace inference { namespace analysis { namespace transformer_tester { -void compare(bool use_mkldnn = false) { +void compare(bool use_onednn = false) { AnalysisConfig cfg; SetConfig(&cfg); - if (!use_mkldnn) { + if (!use_onednn) { cfg.DisableONEDNN(); } @@ -34,7 +34,7 @@ void compare(bool use_mkldnn = false) { TEST(Analyzer_Transformer, compare) { compare(); } #ifdef PADDLE_WITH_DNNL -TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); } +TEST(Analyzer_Transformer, compare_onednn) { compare(true /* use_onednn */); } #endif } // namespace transformer_tester diff --git a/test/deprecated/cpp/inference/api/analyzer_transformer_profile_tester.cc b/test/deprecated/cpp/inference/api/analyzer_transformer_profile_tester.cc index 83f2f0041f8cce..6b6579beacc836 100644 --- a/test/deprecated/cpp/inference/api/analyzer_transformer_profile_tester.cc +++ b/test/deprecated/cpp/inference/api/analyzer_transformer_profile_tester.cc @@ -19,11 +19,11 @@ namespace inference { namespace analysis { namespace transformer_tester { -void profile(bool use_mkldnn = false) { +void profile(bool use_onednn = false) { AnalysisConfig cfg; SetConfig(&cfg); std::vector> outputs; - if (use_mkldnn) { + if (use_onednn) { cfg.EnableONEDNN(); } @@ -37,7 +37,7 @@ void profile(bool use_mkldnn = false) { TEST(Analyzer_Transformer, profile) { profile(); } #ifdef PADDLE_WITH_DNNL -TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); } +TEST(Analyzer_Transformer, profile_onednn) { profile(true); } #endif } // namespace transformer_tester diff --git a/test/deprecated/ir/inference/CMakeLists.txt b/test/deprecated/ir/inference/CMakeLists.txt index 86f03ba89d9850..7fcff5451e2d2c 100755 --- a/test/deprecated/ir/inference/CMakeLists.txt +++ b/test/deprecated/ir/inference/CMakeLists.txt @@ -56,7 +56,7 @@ if(WIN32) endif() -if(NOT WITH_MKLDNN +if(NOT WITH_ONEDNN AND NOT TENSORRT_FOUND AND NOT WITH_GPU) foreach(target ${TEST_INFERENCE_CPU_UT}) diff --git a/test/deprecated/ir/inference/auto_scan_test.py 
b/test/deprecated/ir/inference/auto_scan_test.py index 752b5f32d011ba..16a8dbf24c8f30 100755 --- a/test/deprecated/ir/inference/auto_scan_test.py +++ b/test/deprecated/ir/inference/auto_scan_test.py @@ -226,7 +226,7 @@ def create_inference_config( self, passes: list[str] | None = None, use_gpu: bool = False, - use_mkldnn: bool = False, + use_onednn: bool = False, use_xpu: bool = False, ir_optim: bool | None = None, ): @@ -238,7 +238,7 @@ def create_inference_config( config.switch_ir_optim(ir_optim) if use_gpu: config.enable_use_gpu(100, 0) - if not use_mkldnn: + if not use_onednn: config.disable_onednn() if use_xpu: config.enable_xpu() @@ -337,7 +337,7 @@ def run_test(self, quant=False, *args, **kwargs): def inference_config_str(self, config) -> str: dic = {} enable_onednn = config.onednn_enabled() - dic["use_mkldnn"] = enable_onednn + dic["use_onednn"] = enable_onednn enable_gpu = config.use_gpu() dic["use_gpu"] = enable_gpu return str(dic) @@ -573,7 +573,7 @@ def run_test(self, quant=False, prog_configs=None): def inference_config_str(self, config) -> str: dic = {} enable_onednn = config.onednn_enabled() - dic["use_mkldnn"] = enable_onednn + dic["use_onednn"] = enable_onednn enable_gpu = config.use_gpu() dic['use_gpu'] = enable_gpu enable_xpu = config.use_xpu() diff --git a/test/deprecated/ir/inference/inference_pass_test.py b/test/deprecated/ir/inference/inference_pass_test.py index 739716382f50bd..acf9b68aefa458 100644 --- a/test/deprecated/ir/inference/inference_pass_test.py +++ b/test/deprecated/ir/inference/inference_pass_test.py @@ -129,7 +129,7 @@ def _get_inference_outs(self, config): return outs def _get_analysis_config( - self, use_gpu=False, use_trt=False, use_mkldnn=False + self, use_gpu=False, use_trt=False, use_onednn=False ): ''' Return a new object of AnalysisConfig. @@ -177,7 +177,7 @@ def _get_analysis_config( if self.enable_tensorrt_varseqlen: config.enable_tensorrt_varseqlen() - elif use_mkldnn: + elif use_onednn: config.enable_onednn() if self.enable_onednn_bfloat16: config.enable_onednn_bfloat16() @@ -186,7 +186,7 @@ def _get_analysis_config( def check_output(self, atol=1e-3): ''' Check whether calculating on CPU and GPU, enable TensorRT - or disable TensorRT, enable MKLDNN or disable MKLDNN + or disable TensorRT, enable ONEDNN or disable ONEDNN are all the same. ''' self.assertFalse( @@ -201,7 +201,7 @@ def check_output_with_option( ): ''' Check whether calculating on CPU and GPU, enable TensorRT - or disable TensorRT, enable MKLDNN or disable MKLDNN + or disable TensorRT, enable ONEDNN or disable ONEDNN are all the same. ''' place = base.CUDAPlace(0) if use_gpu else base.CPUPlace() @@ -287,13 +287,13 @@ def check_output_with_option( if (not use_gpu) and self.enable_mkldnn: onednn_outputs = self._get_inference_outs( self._get_analysis_config( - use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn + use_gpu=use_gpu, use_onednn=self.enable_mkldnn ) ) self.assertTrue( len(paddle_outs) == len(onednn_outputs), - "The number of outputs is different between CPU and MKLDNN. ", + "The number of outputs is different between CPU and ONEDNN. ", ) if self.enable_onednn_bfloat16: @@ -304,7 +304,7 @@ def check_output_with_option( onednn_output, rtol=1e-05, atol=atol, - err_msg='Output has diff between CPU and MKLDNN. ', + err_msg='Output has diff between CPU and ONEDNN. 
', ) class TensorRTParam: diff --git a/test/deprecated/ir/inference/quant_dequant_test.py b/test/deprecated/ir/inference/quant_dequant_test.py index 69f2ddfaaa4fda..cb3ddc06b76f13 100644 --- a/test/deprecated/ir/inference/quant_dequant_test.py +++ b/test/deprecated/ir/inference/quant_dequant_test.py @@ -190,7 +190,7 @@ def _get_inference_outs(self, config): return outs def _get_analysis_config( - self, use_gpu=False, use_trt=False, use_mkldnn=False + self, use_gpu=False, use_trt=False, use_onednn=False ): ''' Return a new object of AnalysisConfig. @@ -230,7 +230,7 @@ def _get_analysis_config( if self.enable_tensorrt_varseqlen: config.enable_tensorrt_varseqlen() - elif use_mkldnn: + elif use_onednn: config.enable_onednn() if self.enable_onednn_bfloat16: config.enable_onednn_bfloat16() @@ -241,7 +241,7 @@ def check_output_with_option( ): ''' Check whether calculating on CPU and GPU, enable TensorRT - or disable TensorRT, enable MKLDNN or disable MKLDNN + or disable TensorRT, enable ONEDNN or disable ONEDNN are all the same. ''' place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace() @@ -390,13 +390,13 @@ def check_output_with_option( if (not use_gpu) and self.enable_mkldnn: onednn_outputs = self._get_inference_outs( self._get_analysis_config( - use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn + use_gpu=use_gpu, use_onednn=self.enable_mkldnn ) ) self.assertTrue( len(paddle_outs) == len(onednn_outputs), - "The number of outputs is different between CPU and MKLDNN. ", + "The number of outputs is different between CPU and ONEDNN. ", ) if self.enable_onednn_bfloat16: @@ -407,7 +407,7 @@ def check_output_with_option( onednn_output, rtol=1e-05, atol=atol, - err_msg='Output has diff between CPU and MKLDNN. ', + err_msg='Output has diff between CPU and ONEDNN. 
', ) class TensorRTParam: diff --git a/test/deprecated/legacy_test/test_batch_norm_op_deprecated.py b/test/deprecated/legacy_test/test_batch_norm_op_deprecated.py index 9c4abf21fab0d2..bed1666fffa63b 100644 --- a/test/deprecated/legacy_test/test_batch_norm_op_deprecated.py +++ b/test/deprecated/legacy_test/test_batch_norm_op_deprecated.py @@ -303,7 +303,7 @@ def test_with_place(place, data_layout, shape): "epsilon": epsilon, "is_test": False, "data_layout": data_layout, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "fuse_with_relu": self.fuse_with_relu, "use_global_stats": self.use_global_stats, } diff --git a/test/deprecated/legacy_test/test_layer_norm_op_deprecated.py b/test/deprecated/legacy_test/test_layer_norm_op_deprecated.py index d487569028ddea..c097e5b3ce8c70 100644 --- a/test/deprecated/legacy_test/test_layer_norm_op_deprecated.py +++ b/test/deprecated/legacy_test/test_layer_norm_op_deprecated.py @@ -142,10 +142,10 @@ def check_forward_backward( has_scale=True, has_bias=True, y_grad_scale=1.0, - use_mkldnn=False, + use_onednn=False, ): def test_with_place( - place, shape, begin_norm_axis, use_mkldnn=use_mkldnn + place, shape, begin_norm_axis, use_onednn=use_onednn ): # attr epsilon = 0.00001 @@ -221,7 +221,7 @@ def test_with_place( attrs={ "epsilon": epsilon, "begin_norm_axis": begin_norm_axis, - "use_mkldnn": use_mkldnn, + "use_onednn": use_onednn, }, ) # generate backward op_desc diff --git a/test/deprecated/legacy_test/test_program_deprecated.py b/test/deprecated/legacy_test/test_program_deprecated.py index 5efba85dc5c0b0..582feeda7aabb2 100644 --- a/test/deprecated/legacy_test/test_program_deprecated.py +++ b/test/deprecated/legacy_test/test_program_deprecated.py @@ -153,7 +153,7 @@ class TestProgramProto(unittest.TestCase): def test_update_op(self): program = build_program() a = program.desc.serialize_to_string() - program.current_block().ops[0]._set_attr('use_mkldnn', True) + program.current_block().ops[0]._set_attr('use_onednn', True) self.assertTrue(program.desc.need_update()) b = program.desc.serialize_to_string() self.assertFalse(a == b) @@ -230,7 +230,7 @@ def test_program_update(self): hash1 = program.desc.cached_hash_str() id1 = id(program) # change mul's attr - program.current_block().ops[0]._set_attr('use_mkldnn', True) + program.current_block().ops[0]._set_attr('use_onednn', True) program.current_block().ops[0]._set_attr('scale_x', 2.0) hash2 = program.desc.cached_hash_str() id2 = id(program) diff --git a/test/deprecated/mkldnn/CMakeLists.txt b/test/deprecated/mkldnn/CMakeLists.txt index 12dfb5eb93d25b..997e554e2cd9de 100644 --- a/test/deprecated/mkldnn/CMakeLists.txt +++ b/test/deprecated/mkldnn/CMakeLists.txt @@ -1,12 +1,12 @@ file( - GLOB TEST_MKLDNN_LISTS + GLOB TEST_ONEDNN_LISTS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_MKLDNN_LISTS "${TEST_MKLDNN_LISTS}") +string(REPLACE ".py" "" TEST_ONEDNN_LISTS "${TEST_ONEDNN_LISTS}") if(WIN32) message(STATUS "Skip tests unrelated to onednn/mkldnn") elseif(WITH_ONEDNN) - foreach(target ${TEST_MKLDNN_LISTS}) + foreach(target ${TEST_ONEDNN_LISTS}) py_test_modules(${target} MODULES ${target}) set_tests_properties(${target} PROPERTIES LABELS "RUN_TYPE=INFER" TIMEOUT 120) diff --git a/test/deprecated/mkldnn/test_activation_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_activation_mkldnn_op_deprecated.py index 4bfa8ff2d99668..b03853ff809151 100644 --- a/test/deprecated/mkldnn/test_activation_mkldnn_op_deprecated.py +++ 
b/test/deprecated/mkldnn/test_activation_mkldnn_op_deprecated.py @@ -56,7 +56,7 @@ class TestONEDNNReluDim2(TestRelu): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_dtype(self): self.dtype = np.float32 @@ -66,7 +66,7 @@ class TestONEDNNRelu_ZeroDim(TestRelu_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_dtype(self): self.dtype = np.float32 @@ -75,7 +75,7 @@ def init_dtype(self): class TestONEDNNRelu6Dim2(TestRelu6): def setUp(self): super().setUp() - self.attrs.update({"use_mkldnn": True}) + self.attrs.update({"use_onednn": True}) self.check_pir_onednn = False def init_dtype(self): @@ -85,7 +85,7 @@ def init_dtype(self): class TestONEDNNRelu6_ZeroDim(TestRelu6_ZeroDim): def setUp(self): super().setUp() - self.attrs.update({"use_mkldnn": True}) + self.attrs.update({"use_onednn": True}) self.check_pir_onednn = False def init_dtype(self): @@ -96,7 +96,7 @@ class TestONEDNNLeakyReluDim2(TestLeakyRelu): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -117,7 +117,7 @@ class TestONEDNNLeakyRelu_ZeroDim(TestLeakyRelu_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -135,7 +135,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -150,7 +150,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -165,7 +165,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True, "approximate": True} + self.attrs = {"use_onednn": True, "approximate": True} self.check_pir_onednn = False @@ -173,7 +173,7 @@ class TestONEDNNTanhDim2(TestTanh): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -184,7 +184,7 @@ class TestONEDNNTanh_ZeroDim(TestTanh_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -195,7 +195,7 @@ class TestONEDNNSqrtDim2(TestSqrt): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -206,7 +206,7 @@ class TestONEDNNSqrt_ZeroDim(TestSqrt_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -216,7 +216,7 @@ def init_dtype(self): class TestONEDNNAbsDim2(TestAbs): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_dtype(self): self.dtype = np.float32 @@ -226,7 +226,7 @@ class TestONEDNNAbsZeroSize(TestAbs): def setUp(self): super().setUp() self.check_pir_onednn = True - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_shape(self): self.shape = [0, 12, 0] @@ -236,7 +236,7 @@ class 
TestONEDNNAbsZeroSize1(TestONEDNNAbsZeroSize): def setUp(self): super().setUp() self.check_pir_onednn = True - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_shape(self): self.shape = [0, 12, 0] @@ -245,7 +245,7 @@ def init_shape(self): class TestONEDNNAbs_ZeroDim(TestAbs_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_dtype(self): self.dtype = np.float32 @@ -255,7 +255,7 @@ class TestONEDNNSwishDim2(TestSwish): def setUp(self): super().setUp() - self.attrs["use_mkldnn"] = True + self.attrs["use_onednn"] = True self.check_pir_onednn = False def init_dtype(self): @@ -266,7 +266,7 @@ class TestONEDNNSwish_ZeroDim(TestSwish_ZeroDim): def setUp(self): super().setUp() - self.attrs["use_mkldnn"] = True + self.attrs["use_onednn"] = True self.check_eager = False self.check_pir_onednn = False @@ -277,27 +277,27 @@ def init_dtype(self): class TestONEDNNHardSwishDim2(TestHardSwish): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False class TestONEDNNHardSwish_ZeroDim(TestHardSwish_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False class TestONEDNNSigmoidDim2(TestSigmoid): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} class TestONEDNNSigmoid_ZeroDim(TestSigmoid_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} class TestONEDNNReluDim4(TestRelu): @@ -311,7 +311,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_dtype(self): self.dtype = np.float32 @@ -328,7 +328,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -356,7 +356,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -371,7 +371,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True, "approximate": True} + self.attrs = {"use_onednn": True, "approximate": True} self.check_pir_onednn = False @@ -389,7 +389,7 @@ def setUp(self): self.inputs = {'X': convert_float_to_uint16(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def test_check_output(self): @@ -413,7 +413,7 @@ def setUp(self): self.inputs = {'X': convert_float_to_uint16(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True, "approximate": True} + self.attrs = {"use_onednn": True, "approximate": True} self.check_pir_onednn = False def test_check_output(self): @@ -431,7 +431,7 @@ def setUp(self): 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") } self.outputs = {'Out': np.tanh(self.inputs['X'])} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -443,7 +443,7 @@ def setUp(self): 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") } self.outputs = {'Out': 
np.sqrt(self.inputs['X'])} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -456,7 +456,7 @@ def setUp(self): x[np.abs(x) < 0.005] = 0.02 self.inputs = {'X': x} self.outputs = {'Out': np.abs(self.inputs['X'])} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_dtype(self): self.dtype = np.float32 @@ -487,7 +487,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -505,7 +505,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -520,7 +520,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -533,7 +533,7 @@ def setUp(self): self.inputs = {'X': x} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def test_check_output(self): @@ -554,7 +554,7 @@ def setUp(self): self.inputs = {'X': x} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def test_check_output(self): @@ -574,7 +574,7 @@ def setUp(self): out = 1 / (1 + np.exp(-x)) self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} class TestONEDNNEluDefaultAlpha(TestActivation): @@ -586,7 +586,7 @@ def setUp(self): x = np.random.random((5, 5, 4)).astype("float32") self.inputs = {'X': x} - self.attrs = {'use_mkldnn': True, 'alpha': self.alpha} + self.attrs = {'use_onednn': True, 'alpha': self.alpha} self.outputs = { 'Out': np.maximum(0, x) + np.minimum(0, self.alpha * (np.exp(x) - 1)) @@ -606,7 +606,7 @@ def setUp(self): x = np.random.random(()).astype("float32") self.inputs = {'X': x} - self.attrs = {'use_mkldnn': True, 'alpha': self.alpha} + self.attrs = {'use_onednn': True, 'alpha': self.alpha} self.outputs = { 'Out': np.maximum(0, x) + np.minimum(0, self.alpha * (np.exp(x) - 1)) @@ -629,7 +629,7 @@ def setUp(self): x = np.random.random((5, 5, 4)).astype("float32") self.inputs = {'X': x} - self.attrs = {'use_mkldnn': True} + self.attrs = {'use_onednn': True} self.outputs = {'Out': np.exp(x)} self.check_pir_onednn = False @@ -641,7 +641,7 @@ def setUp(self): x = np.random.random(()).astype("float32") self.inputs = {'X': x} - self.attrs = {'use_mkldnn': True} + self.attrs = {'use_onednn': True} self.outputs = {'Out': np.exp(x)} self.check_pir_onednn = False @@ -674,7 +674,7 @@ def test_check(self): class TestONEDNNSoftplusDim2(TestSoftplus): def setUp(self): super().setUp() - self.attrs.update({"use_mkldnn": True}) + self.attrs.update({"use_onednn": True}) self.check_pir_onednn = False def init_dtype(self): @@ -684,7 +684,7 @@ def init_dtype(self): class TestONEDNNSoftplus_ZeroDim(TestSoftplus_ZeroDim): def setUp(self): super().setUp() - self.attrs.update({"use_mkldnn": True}) + self.attrs.update({"use_onednn": True}) def init_dtype(self): self.dtype = np.float32 diff --git a/test/deprecated/mkldnn/test_clip_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_clip_mkldnn_op_deprecated.py index 
bd9adb38dcc865..3f30cfee0892bd 100644 --- a/test/deprecated/mkldnn/test_clip_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_clip_mkldnn_op_deprecated.py @@ -59,7 +59,7 @@ def adjust_op_settings(self): pass def set_attrs(self): - self.attrs = {'min': 7.2, 'max': 9.6, 'use_mkldnn': True} + self.attrs = {'min': 7.2, 'max': 9.6, 'use_onednn': True} def test_check_output(self): self.check_output(check_dygraph=False, check_pir_onednn=True) diff --git a/test/deprecated/mkldnn/test_concat_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_concat_mkldnn_op_deprecated.py index 59e6590b0ddec1..9bef735b1e48a5 100644 --- a/test/deprecated/mkldnn/test_concat_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_concat_mkldnn_op_deprecated.py @@ -32,7 +32,7 @@ def setUp(self): self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} self.attrs = { 'axis': self.axis, - 'use_mkldnn': True, + 'use_onednn': True, 'mkldnn_data_type': self.onednn_data_type, } @@ -117,7 +117,7 @@ def setUp(self): self.inputs = {'X': [(f'x{i}', self.x) for i in range(136)]} self.attrs = { 'axis': self.axis, - 'use_mkldnn': True, + 'use_onednn': True, 'mkldnn_data_type': self.onednn_data_type, } diff --git a/test/deprecated/mkldnn/test_layer_norm_bf16_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_layer_norm_bf16_mkldnn_op_deprecated.py index 9b656f3aa0bf85..52f03f6e3ff22a 100644 --- a/test/deprecated/mkldnn/test_layer_norm_bf16_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_layer_norm_bf16_mkldnn_op_deprecated.py @@ -116,7 +116,7 @@ def check_forward( attrs={ "epsilon": epsilon, "begin_norm_axis": begin_norm_axis, - "use_mkldnn": True, + "use_onednn": True, "is_test": with_is_test, }, ) diff --git a/test/deprecated/mkldnn/test_layer_norm_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_layer_norm_mkldnn_op_deprecated.py index a3d56abd628405..226a7602b5c58c 100644 --- a/test/deprecated/mkldnn/test_layer_norm_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_layer_norm_mkldnn_op_deprecated.py @@ -126,7 +126,7 @@ def check_forward( attrs={ "epsilon": epsilon, "begin_norm_axis": begin_norm_axis, - "use_mkldnn": True, + "use_onednn": True, "is_test": with_is_test, }, ) diff --git a/test/deprecated/mkldnn/test_prelu_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_prelu_mkldnn_op_deprecated.py index 304830b673fbe5..72e65827acf1a6 100644 --- a/test/deprecated/mkldnn/test_prelu_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_prelu_mkldnn_op_deprecated.py @@ -59,7 +59,7 @@ def setUp(self): self.x = np.random.random((2, 4, 5, 5)).astype("float32") + 1 self.init_attrs() self.set_inputs() - self.attrs = {'mode': self.mode, 'use_mkldnn': True} + self.attrs = {'mode': self.mode, 'use_onednn': True} self.set_dtype_attr() self.outputs = {'Out': ref_prelu(self.x, self.alpha, self.mode)} @@ -102,7 +102,7 @@ def setUp(self): self.x = np.random.random(()).astype("float32") self.init_attrs() self.set_inputs() - self.attrs = {'mode': self.mode, 'use_mkldnn': True} + self.attrs = {'mode': self.mode, 'use_onednn': True} self.set_dtype_attr() self.outputs = {'Out': self.x if self.x > 0 else self.x * self.alpha} diff --git a/test/deprecated/mkldnn/test_reduce_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_reduce_mkldnn_op_deprecated.py index 5c44e58f4f33e0..b9f52322bb95ba 100644 --- a/test/deprecated/mkldnn/test_reduce_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_reduce_mkldnn_op_deprecated.py @@ -26,7 +26,7 @@ def setUp(self): self.use_onednn = True 
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} self.outputs = {'Out': self.inputs['X'].sum(axis=0)} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} self.check_pir_onednn = True def test_check_output(self): @@ -53,7 +53,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random((5, 10, 5, 5)).astype("float32")} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': [2]} + self.attrs = {'use_onednn': self.use_onednn, 'dim': [2]} self.outputs = { 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) } @@ -66,7 +66,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random((5, 10, 5, 3)).astype("float32")} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': [0, 1, 2, 3]} + self.attrs = {'use_onednn': self.use_onednn, 'dim': [0, 1, 2, 3]} self.outputs = { 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) } @@ -77,7 +77,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random((2, 5, 3, 2, 2)).astype("float32")} - self.attrs = {'dim': (2, 3, 4), 'keep_dim': True, 'use_mkldnn': True} + self.attrs = {'dim': (2, 3, 4), 'keep_dim': True, 'use_onednn': True} self.outputs = { 'Out': self.inputs['X'].sum( axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim'] @@ -90,7 +90,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random(()).astype("float32")} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': []} + self.attrs = {'use_onednn': self.use_onednn, 'dim': []} self.outputs = { 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) } @@ -103,7 +103,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random((2, 5, 3, 2, 2)).astype("float32")} - self.attrs = {'reduce_all': True, 'keep_dim': True, 'use_mkldnn': True} + self.attrs = {'reduce_all': True, 'keep_dim': True, 'use_onednn': True} self.outputs = { 'Out': self.inputs['X'].sum(keepdims=self.attrs['keep_dim']) } @@ -115,7 +115,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} - self.attrs = {'reduce_all': True, 'use_mkldnn': self.use_onednn} + self.attrs = {'reduce_all': True, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.inputs['X'].sum()} self.check_pir_onednn = False @@ -131,7 +131,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} - self.attrs = {'dim': (), 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': (), 'use_onednn': self.use_onednn} self.outputs = {'Out': np.copy(self.inputs['X'])} @@ -146,7 +146,7 @@ def setUp(self): self.op_type = "reduce_max" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} - self.attrs = {'dim': [-1], 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': [-1], 'use_onednn': self.use_onednn} self.outputs = { 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])) } @@ -161,7 +161,7 @@ def setUp(self): self.op_type = "reduce_max" self.use_onednn = True self.inputs = {'X': np.random.random(()).astype("float32")} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': []} + self.attrs = {'use_onednn': self.use_onednn, 'dim': []} self.outputs = { 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])) } @@ -180,7 +180,7 @@ def 
setUp(self): self.op_type = "reduce_max" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 10, 9)).astype("float32")} - self.attrs = {'dim': [-1, 0, 1], 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': [-1, 0, 1], 'use_onednn': self.use_onednn} self.outputs = { 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])) } @@ -197,7 +197,7 @@ def setUp(self): self.op_type = "reduce_min" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} - self.attrs = {'dim': [2], 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': [2], 'use_onednn': self.use_onednn} self.outputs = { 'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim'])) } @@ -212,7 +212,7 @@ def setUp(self): self.op_type = "reduce_min" self.use_onednn = True self.inputs = {'X': np.random.random(()).astype("float32")} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': []} + self.attrs = {'use_onednn': self.use_onednn, 'dim': []} self.outputs = { 'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim'])) } @@ -223,7 +223,7 @@ def setUp(self): self.op_type = "reduce_mean" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} - self.attrs = {'dim': [0], 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': [0], 'use_onednn': self.use_onednn} self.outputs = { 'Out': self.inputs['X'].sum(axis=0) / self.inputs['X'].shape[0] } @@ -234,7 +234,7 @@ def setUp(self): self.op_type = "reduce_mean" self.use_onednn = True self.inputs = {'X': np.random.random(()).astype("float32")} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': []} + self.attrs = {'use_onednn': self.use_onednn, 'dim': []} self.outputs = { # scalar mean is equal to sum 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) @@ -246,7 +246,7 @@ def setUp(self): self.op_type = "reduce_mean" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 8, 10)).astype("float32")} - self.attrs = {'reduce_all': True, 'use_mkldnn': self.use_onednn} + self.attrs = {'reduce_all': True, 'use_onednn': self.use_onednn} self.outputs = { 'Out': self.inputs['X'].sum() / np.asarray(self.inputs['X'].shape).prod() diff --git a/test/deprecated/mkldnn/test_reshape_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_reshape_mkldnn_op_deprecated.py index be2c1c948a19cd..8f48abd784a29d 100644 --- a/test/deprecated/mkldnn/test_reshape_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_reshape_mkldnn_op_deprecated.py @@ -36,7 +36,7 @@ def setUp(self): 'XShape': np.random.random(self.ori_shape).astype("float32"), } self.x = self.inputs["X"] - self.attrs['use_mkldnn'] = True + self.attrs['use_onednn'] = True self.set_additional_inputs() self.set_outputs() @@ -208,7 +208,7 @@ def setUp(self): super().setUp() self.dtype = np.uint16 self.inputs = {"X": convert_float_to_uint16(self.x)} - self.attrs['use_mkldnn'] = True + self.attrs['use_onednn'] = True def calculate_grads(self): self.dout = self.outputs['Out'] diff --git a/test/deprecated/mkldnn/test_scale_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_scale_mkldnn_op_deprecated.py index 1d50d92e8e4581..9570bb2091edb8 100644 --- a/test/deprecated/mkldnn/test_scale_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_scale_mkldnn_op_deprecated.py @@ -25,7 +25,7 @@ def setUp(self): self.init_shape() self.op_type = "scale" self.inputs = {'X': np.random.random(self.shape).astype(np.float32)} - self.attrs = {'scale': -2.3, 'use_mkldnn': True, 'bias': 0.2} + self.attrs = {'scale': -2.3, 'use_onednn': True, 'bias': 0.2} self.use_onednn = 
True self.outputs = { 'Out': (self.inputs['X'] * self.attrs['scale']) + self.attrs['bias'] @@ -54,7 +54,7 @@ def setUp(self): self.inputs = {'X': np.random.random((10, 10)).astype(np.float32)} self.attrs = { 'scale': 1.5, - 'use_mkldnn': True, + 'use_onednn': True, 'bias': 2.3, 'bias_after_scale': False, } diff --git a/test/deprecated/mkldnn/test_softmax_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_softmax_mkldnn_op_deprecated.py index 6056535c6d9eb2..645d1e675e6bad 100644 --- a/test/deprecated/mkldnn/test_softmax_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_softmax_mkldnn_op_deprecated.py @@ -67,7 +67,7 @@ def setUp(self): self.attrs = { 'axis': self.axis, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } def test_check_output(self): diff --git a/test/deprecated/mkldnn/test_split_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_split_mkldnn_op_deprecated.py index 3a01f29aa0d305..95d65ed46e8699 100644 --- a/test/deprecated/mkldnn/test_split_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_split_mkldnn_op_deprecated.py @@ -52,7 +52,7 @@ def setUp(self): self.init_data_type() self.init_test_case() self.inputs = {'X': self.x} - self.attrs = {'use_mkldnn': True, 'num': self.num} + self.attrs = {'use_onednn': True, 'num': self.num} if self.axis is not None: self.attrs['axis'] = self.axis diff --git a/test/deprecated/mkldnn/test_sum_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_sum_mkldnn_op_deprecated.py index a00e1c6096757d..3ca84284f7f3f6 100644 --- a/test/deprecated/mkldnn/test_sum_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_sum_mkldnn_op_deprecated.py @@ -32,7 +32,7 @@ def setUp(self): self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} y = x0 + x1 + x2 self.outputs = {'Out': y} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} def init_data_type(self): self.dtype = np.float32 @@ -73,7 +73,7 @@ def test_check_output(self): tensor.set(var_value, place) sum_op = Operator( - "sum", X=["x0", "x1"], Out=out_var_name, use_mkldnn=True + "sum", X=["x0", "x1"], Out=out_var_name, use_onednn=True ) expected_out = np.array(self.x0 + self.x1) sum_op.run(scope, place) diff --git a/test/deprecated/quantization/CMakeLists.txt b/test/deprecated/quantization/CMakeLists.txt index c5b4d9d3a67137..dbf0dbd0806a43 100644 --- a/test/deprecated/quantization/CMakeLists.txt +++ b/test/deprecated/quantization/CMakeLists.txt @@ -5,13 +5,13 @@ file( string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") function(_inference_analysis_python_api_int8_test target model_dir data_path - filename use_mkldnn) + filename use_onednn) py_test( ${target} SRCS ${filename} ENVS CPU_NUM_THREADS=${CPU_NUM_THREADS_ON_CI} - FLAGS_use_onednn=${use_mkldnn} + FLAGS_use_onednn=${use_onednn} ARGS --infer_model ${model_dir}/model @@ -207,7 +207,7 @@ if(NOT WITH_GPU) list(REMOVE_ITEM TEST_OPS test_apply_per_channel_scale) endif() -if(LINUX AND WITH_MKLDNN) +if(LINUX AND WITH_ONEDNN) #### Image classification dataset: ImageNet (small) # The dataset should already be downloaded for INT8v2 unit tests diff --git a/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py b/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py index d7221b53ecbd50..2a73ad7154f4fe 100644 --- a/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py +++ b/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py @@ -64,7 +64,7 @@ def prepare_program_mul(self, 
program): type=self.op_name(), inputs={"X": block.var('mul_input'), "Y": block.var('mul_weights')}, outputs={"Out": block.var('mul_output')}, - attrs={'use_mkldnn': self.use_onednn}, + attrs={'use_onednn': self.use_onednn}, ) def test_dequantize_op_weights(self): @@ -179,7 +179,7 @@ def prepare_program_conv2d(self, program): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'fuse_relu': True, }, @@ -197,7 +197,7 @@ def prepare_program_conv2d(self, program): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, }, ) @@ -312,7 +312,7 @@ def prepare_program(self, program): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'fuse_relu': True, }, @@ -329,7 +329,7 @@ def prepare_program(self, program): 'out_w': self.out_w, 'scale': self.scale, 'data_layout': self.data_layout, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, }, ) block.append_op( diff --git a/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py b/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py index addd9aad1179b9..2100bdccaa4857 100644 --- a/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py +++ b/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py @@ -60,7 +60,7 @@ def conv_net(img, label): return avg_loss -class TestMKLDNNTransformBasedFreezePass(unittest.TestCase): +class TestONEDNNTransformBasedFreezePass(unittest.TestCase): def setUp(self): self.quantizable_op_and_inputs = { 'conv2d': ['Input', 'Filter'], diff --git a/test/dygraph_to_static/simnet_dygraph_model.py b/test/dygraph_to_static/simnet_dygraph_model.py index 35262bd77e8397..a3e19de4cc3670 100644 --- a/test/dygraph_to_static/simnet_dygraph_model.py +++ b/test/dygraph_to_static/simnet_dygraph_model.py @@ -410,7 +410,7 @@ def forward(self, input): type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}, - attrs={"use_mkldnn": False}, + attrs={"use_onednn": False}, ) if self._b is not None: diff --git a/test/ir/inference/inference_pass_test.py b/test/ir/inference/inference_pass_test.py index 34bdfb4d2c16c5..ae823dfeea9ad9 100644 --- a/test/ir/inference/inference_pass_test.py +++ b/test/ir/inference/inference_pass_test.py @@ -37,7 +37,7 @@ def __init__(self, methodName='runTest'): self.feeds = None self.fetch_list = None - self.enable_mkldnn = False + self.enable_onednn = False self.enable_onednn_bfloat16 = False self.enable_trt = False self.enable_tensorrt_varseqlen = False @@ -130,7 +130,7 @@ def _get_inference_outs(self, config): return outs def _get_analysis_config( - self, use_gpu=False, use_trt=False, use_mkldnn=False + self, use_gpu=False, use_trt=False, use_onednn=False ): ''' Return a new object of AnalysisConfig. @@ -178,7 +178,7 @@ def _get_analysis_config( if self.enable_tensorrt_varseqlen: config.enable_tensorrt_varseqlen() - elif use_mkldnn: + elif use_onednn: config.enable_onednn() if self.enable_onednn_bfloat16: config.enable_onednn_bfloat16() @@ -286,10 +286,10 @@ def check_output_with_option( ) # Check whether the onednn results and the CPU results are the same. 
- if (not use_gpu) and self.enable_mkldnn: + if (not use_gpu) and self.enable_onednn: onednn_outputs = self._get_inference_outs( self._get_analysis_config( - use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn + use_gpu=use_gpu, use_onednn=self.enable_onednn ) ) diff --git a/test/ir/inference/quant_dequant_test.py b/test/ir/inference/quant_dequant_test.py index f955273a88667f..1091e0282fb74a 100644 --- a/test/ir/inference/quant_dequant_test.py +++ b/test/ir/inference/quant_dequant_test.py @@ -46,7 +46,7 @@ def __init__(self, methodName='runTest'): self.test_startup_program = paddle.static.Program() self.feeds = None self.fetch_list = None - self.enable_mkldnn = False + self.enable_onednn = False self.enable_onednn_bfloat16 = False self.enable_trt = False self.enable_tensorrt_varseqlen = True @@ -190,7 +190,7 @@ def _get_inference_outs(self, config): return outs def _get_analysis_config( - self, use_gpu=False, use_trt=False, use_mkldnn=False + self, use_gpu=False, use_trt=False, use_onednn=False ): ''' Return a new object of AnalysisConfig. @@ -230,7 +230,7 @@ def _get_analysis_config( if self.enable_tensorrt_varseqlen: config.enable_tensorrt_varseqlen() - elif use_mkldnn: + elif use_onednn: config.enable_onednn() if self.enable_onednn_bfloat16: config.enable_onednn_bfloat16() @@ -388,10 +388,10 @@ def check_output_with_option( ) # Check whether the onednn results and the CPU results are the same. - if (not use_gpu) and self.enable_mkldnn: + if (not use_gpu) and self.enable_onednn: onednn_outputs = self._get_inference_outs( self._get_analysis_config( - use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn + use_gpu=use_gpu, use_onednn=self.enable_onednn ) ) diff --git a/test/ir/inference/test_conv_act_onednn_fuse_pass.py b/test/ir/inference/test_conv_act_onednn_fuse_pass.py index 8392b19875abfa..4c7b0d2e1cc5aa 100755 --- a/test/ir/inference/test_conv_act_onednn_fuse_pass.py +++ b/test/ir/inference/test_conv_act_onednn_fuse_pass.py @@ -207,7 +207,7 @@ def sample_program_config(self, draw): groups=groups, dilations=dilations, data_format=data_format, - use_mkldnn=True, + use_onednn=True, ) ops = [conv2d_op, act_op] diff --git a/test/ir/inference/test_conv_bn_fuse_pass.py b/test/ir/inference/test_conv_bn_fuse_pass.py index 9cfd09d53ca9e7..d4861008858257 100644 --- a/test/ir/inference/test_conv_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_bn_fuse_pass.py @@ -108,7 +108,7 @@ def generate_bn_Var(): groups=groups, paddings=paddings, strides=strides, - use_mkldnn=use_onednn, + use_onednn=use_onednn, has_bias=False, is_test=True, ) @@ -158,7 +158,7 @@ def generate_bn_Var(): def sample_predictor_configs(self, program_config): # for onednn - if program_config.ops[0].attrs['use_mkldnn']: + if program_config.ops[0].attrs['use_onednn']: config = self.create_inference_config(use_onednn=True) yield config, ['fused_conv2d'], (1e-5, 1e-5) else: diff --git a/test/ir/inference/test_conv_transpose_bn_fuse_pass.py b/test/ir/inference/test_conv_transpose_bn_fuse_pass.py index 31e9bc98973814..99fddb614697ef 100644 --- a/test/ir/inference/test_conv_transpose_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_transpose_bn_fuse_pass.py @@ -67,7 +67,7 @@ def sample_program_config(self, draw): st.sampled_from(["EXPLICIT", "SAME", "VALID"]) ) random_data_layout = draw(st.sampled_from(["NCHW", "NHWC"])) - random_use_mkldnn = draw(st.booleans()) + random_use_onednn = draw(st.booleans()) random_output_size = [] random_filter = draw( st.lists( @@ -133,7 +133,7 @@ def generate_batch_norm_Variance(): 'data_format': 
random_data_layout, 'output_size': random_output_size, 'output_padding': random_output_size, - 'use_mkldnn': random_use_mkldnn, + 'use_mkldnn': random_use_onednn, 'is_test': True, }, ) @@ -160,7 +160,7 @@ def generate_batch_norm_Variance(): 'is_test': True, 'trainable_statistics': False, 'data_layout': random_data_layout, - 'use_mkldnn': random_use_mkldnn, + 'use_mkldnn': random_use_onednn, }, ) diff --git a/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py b/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py index 50b19a7ffba3a4..216b661156b76e 100644 --- a/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py @@ -71,7 +71,7 @@ def sample_program_config(self, draw): st.sampled_from(["EXPLICIT", "SAME", "VALID"]) ) random_data_layout = draw(st.sampled_from(["NCHW", "NHWC"])) - random_use_mkldnn = draw(st.booleans()) + random_use_onednn = draw(st.booleans()) random_output_size = [] random_filter = draw( st.lists( @@ -141,7 +141,7 @@ def generate_batch_norm_Variance(): 'data_format': random_data_layout, 'output_size': random_output_size, 'output_padding': random_output_size, - 'use_mkldnn': random_use_mkldnn, + 'use_mkldnn': random_use_onednn, 'is_test': True, }, ) @@ -182,7 +182,7 @@ def generate_batch_norm_Variance(): 'is_test': True, 'trainable_statistics': False, 'data_layout': random_data_layout, - 'use_mkldnn': random_use_mkldnn, + 'use_mkldnn': random_use_onednn, }, ) diff --git a/test/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py b/test/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py index 80cd83e79f8338..cd01ad161725ae 100644 --- a/test/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py +++ b/test/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py @@ -99,7 +99,7 @@ def sample_program_config(self, draw): padding_weights=False, activation_type="", use_quantizer=False, - use_mkldnn=False, + use_onednn=False, ) add_op = OpConfig( "elementwise_add", diff --git a/test/ir/inference/test_mkldnn_int8_scale_calculation_pass.py b/test/ir/inference/test_mkldnn_int8_scale_calculation_pass.py index 6c61d24ac269f8..456a0781118b54 100644 --- a/test/ir/inference/test_mkldnn_int8_scale_calculation_pass.py +++ b/test/ir/inference/test_mkldnn_int8_scale_calculation_pass.py @@ -123,7 +123,7 @@ def sample_program_config(self, draw): bias_shape = [f_shape[0]] inputs = {} weights = {} - use_mkldnn = True + use_onednn = True has_bias = draw(st.booleans()) if has_bias: @@ -154,7 +154,7 @@ def sample_program_config(self, draw): groups=groups, dilations=dilations, data_format=data_format, - use_mkldnn=use_mkldnn, + use_onednn=use_onednn, mkldnn_data_type="int8", ) diff --git a/test/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py index eb73fa54ae6806..e53c32bcdaf298 100644 --- a/test/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py @@ -140,7 +140,7 @@ def generate_input(type): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ 'matmul_activation_onednn_fuse_pass', 'operator_scale_onednn_fuse_pass', diff --git a/test/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py index 278b2b4102cf2d..252378c60b36d5 100644 --- 
a/test/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py @@ -131,7 +131,7 @@ def generate_input(): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ 'matmul_elementwise_add_onednn_fuse_pass', 'matmul_activation_onednn_fuse_pass', diff --git a/test/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py index 0f9db3a18eadb7..96b978d88c5cf7 100644 --- a/test/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py @@ -74,7 +74,7 @@ def generate_input(): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, passes=['matmul_elementwise_add_onednn_fuse_pass'] + use_onednn=True, passes=['matmul_elementwise_add_onednn_fuse_pass'] ) yield config, ['fused_matmul'], (1e-5, 1e-5) @@ -137,7 +137,7 @@ def generate_input(): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, passes=['matmul_elementwise_add_onednn_fuse_pass'] + use_onednn=True, passes=['matmul_elementwise_add_onednn_fuse_pass'] ) yield config, ['fused_matmul'], (1e-5, 1e-5) @@ -203,7 +203,7 @@ def generate_input_redisual(): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, passes=['matmul_elementwise_add_onednn_fuse_pass'] + use_onednn=True, passes=['matmul_elementwise_add_onednn_fuse_pass'] ) yield config, ['fused_matmul'], (1e-5, 1e-5) diff --git a/test/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py index 7ac863e675ac7c..017b7387e5c45f 100644 --- a/test/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py @@ -144,7 +144,7 @@ def generate_input(type): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ 'matmul_activation_onednn_fuse_pass', 'operator_scale_onednn_fuse_pass', diff --git a/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py index d6be1efaa34353..cf383495f52c42 100644 --- a/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py @@ -53,7 +53,7 @@ def generate_input(type): type='matmul_v2', inputs={'X': ['matmul_X'], 'Y': ['matmul_Y']}, outputs={'Out': ['matmul_output']}, - attrs={'use_mkldnn': True}, + attrs={'use_onednn': True}, ) if matmul_as_x: @@ -65,7 +65,7 @@ def generate_input(type): type='elementwise_add', inputs=inputs, outputs={'Out': ['elementwise_add_output']}, - attrs={'axis': axis, 'use_mkldnn': True}, + attrs={'axis': axis, 'use_onednn': True}, ) model_net = [matmul_op, elt_add_op] diff --git a/test/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py b/test/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py index b4181a7e6580e0..0d86d8385d0c28 100644 --- a/test/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py @@ -137,7 +137,7 @@ def generate_input(attrs, type): def sample_predictor_configs(self, program_config): config = 
self.create_inference_config( - use_mkldnn=True, passes=['scale_matmul_fuse_pass'] + use_onednn=True, passes=['scale_matmul_fuse_pass'] ) yield config, ['matmul'], (1e-5, 1e-5) diff --git a/test/ir/inference/test_onednn_conv_bias_fuse_pass.py b/test/ir/inference/test_onednn_conv_bias_fuse_pass.py index ac82c4997da3af..3a1435ad0bc0a8 100644 --- a/test/ir/inference/test_onednn_conv_bias_fuse_pass.py +++ b/test/ir/inference/test_onednn_conv_bias_fuse_pass.py @@ -130,7 +130,7 @@ def sample_program_config(self, draw): conv_bias_shape = [] inputs = {} weights = {} - use_mkldnn = None + use_onednn = None conv_type = 'conv2d' if draw(st.booleans()): conv_bias_shape = [f_shape[0]] @@ -145,7 +145,7 @@ def sample_program_config(self, draw): 'bias': TensorConfig(shape=bias_shape), 'conv_bias': TensorConfig(shape=conv_bias_shape), } - use_mkldnn = True + use_onednn = True else: inputs = { 'Input': ['input_x'], @@ -155,7 +155,7 @@ def sample_program_config(self, draw): 'filter': TensorConfig(shape=f_shape), 'bias': TensorConfig(shape=bias_shape), } - use_mkldnn = False + use_onednn = False conv2d_op = OpConfig( conv_type, @@ -167,7 +167,7 @@ def sample_program_config(self, draw): groups=groups, dilations=dilations, data_format=data_format, - use_mkldnn=use_mkldnn, + use_onednn=use_onednn, ) add_op = OpConfig( diff --git a/test/ir/inference/test_onednn_conv_bn_fuse_pass.py b/test/ir/inference/test_onednn_conv_bn_fuse_pass.py index da95b32fcda80b..18a4da54a54464 100644 --- a/test/ir/inference/test_onednn_conv_bn_fuse_pass.py +++ b/test/ir/inference/test_onednn_conv_bn_fuse_pass.py @@ -23,7 +23,7 @@ class TestOneDNNConvBnFusePass(PassAutoScanTest): def sample_program_config(self, draw): - use_mkldnn = True + use_onednn = True padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) groups = draw(st.integers(min_value=1, max_value=3)) data_format = draw(st.sampled_from(["NCHW", "NHWC"])) @@ -78,7 +78,7 @@ def generate_data(shape): groups=groups, paddings=paddings, strides=strides, - use_mkldnn=use_mkldnn, + use_onednn=use_onednn, has_bias=False, is_test=True, ) diff --git a/test/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py b/test/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py index 3d396968a76018..3cf14d3c772c2c 100644 --- a/test/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py +++ b/test/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py @@ -116,7 +116,7 @@ def generate_input(): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ 'elementwise_act_onednn_fuse_pass', 'operator_scale_onednn_fuse_pass', diff --git a/test/ir/inference/test_onednn_fc_activation_fuse_pass.py b/test/ir/inference/test_onednn_fc_activation_fuse_pass.py index 84517b6dfc8546..01923c2c3031f2 100644 --- a/test/ir/inference/test_onednn_fc_activation_fuse_pass.py +++ b/test/ir/inference/test_onednn_fc_activation_fuse_pass.py @@ -134,7 +134,7 @@ def generate_input(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ "fc_act_onednn_fuse_pass", "operator_scale_onednn_fuse_pass", diff --git a/test/ir/inference/test_onednn_fc_gru_fuse_pass.py b/test/ir/inference/test_onednn_fc_gru_fuse_pass.py index 1b2d7b0be6e4f5..069ed1fe44169d 100644 --- a/test/ir/inference/test_onednn_fc_gru_fuse_pass.py +++ b/test/ir/inference/test_onednn_fc_gru_fuse_pass.py @@ -103,7 +103,7 @@ def 
generate_input(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ 'onednn_placement_pass', 'fc_gru_fuse_pass', diff --git a/test/ir/inference/test_onednn_fc_lstm_fuse_pass.py b/test/ir/inference/test_onednn_fc_lstm_fuse_pass.py index 93e755f4032ff3..933c3477ea8330 100644 --- a/test/ir/inference/test_onednn_fc_lstm_fuse_pass.py +++ b/test/ir/inference/test_onednn_fc_lstm_fuse_pass.py @@ -107,7 +107,7 @@ def generate_data(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ 'onednn_placement_pass', 'fc_lstm_fuse_pass', diff --git a/test/ir/inference/test_onednn_multi_gru_fuse_pass.py b/test/ir/inference/test_onednn_multi_gru_fuse_pass.py index 1133504a149caa..9a5dbbf2273a8a 100644 --- a/test/ir/inference/test_onednn_multi_gru_fuse_pass.py +++ b/test/ir/inference/test_onednn_multi_gru_fuse_pass.py @@ -121,7 +121,7 @@ def generate_bias(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=['multi_gru_fuse_pass'], ) yield config, ['multi_gru'], (1e-5, 1e-5) diff --git a/test/ir/inference/test_onednn_multi_gru_seq_fuse_pass.py b/test/ir/inference/test_onednn_multi_gru_seq_fuse_pass.py index dbb1439dda96cb..43a7f1952c8bd1 100644 --- a/test/ir/inference/test_onednn_multi_gru_seq_fuse_pass.py +++ b/test/ir/inference/test_onednn_multi_gru_seq_fuse_pass.py @@ -196,7 +196,7 @@ def generate_bias(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=['multi_gru_fuse_pass', 'multi_gru_seq_fuse_pass'], ) yield config, ['multi_gru'], (1e-5, 1e-5) diff --git a/test/ir/inference/test_onednn_operator_reshape2_fuse_pass.py b/test/ir/inference/test_onednn_operator_reshape2_fuse_pass.py index 758950be6ee678..abd8f90f099632 100644 --- a/test/ir/inference/test_onednn_operator_reshape2_fuse_pass.py +++ b/test/ir/inference/test_onednn_operator_reshape2_fuse_pass.py @@ -75,7 +75,7 @@ def generate_input(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ "operator_reshape2_onednn_fuse_pass", ], diff --git a/test/ir/inference/test_onednn_operator_unsqueeze2_fuse_pass.py b/test/ir/inference/test_onednn_operator_unsqueeze2_fuse_pass.py index d1f441f3444cab..f35c355eb0314f 100644 --- a/test/ir/inference/test_onednn_operator_unsqueeze2_fuse_pass.py +++ b/test/ir/inference/test_onednn_operator_unsqueeze2_fuse_pass.py @@ -73,7 +73,7 @@ def generate_input(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ "operator_unsqueeze2_onednn_fuse_pass", ], @@ -138,7 +138,7 @@ def generate_input(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ "operator_unsqueeze2_onednn_fuse_pass", ], diff --git a/test/ir/inference/test_onednn_quant_transpose_dequant_fuse_pass.py b/test/ir/inference/test_onednn_quant_transpose_dequant_fuse_pass.py index 5c8f89bd5f8063..1ffcbf37b1054f 100644 --- a/test/ir/inference/test_onednn_quant_transpose_dequant_fuse_pass.py +++ b/test/ir/inference/test_onednn_quant_transpose_dequant_fuse_pass.py @@ -65,7 +65,7 @@ def generate_input(): 'use_mkldnn': True, 
'mkldnn_data_type': 'int8', }, - use_mkldnn=True, + use_onednn=True, ) transpose2_op_2 = OpConfig( @@ -80,7 +80,7 @@ def generate_input(): 'use_mkldnn': True, 'mkldnn_data_type': 'int8', }, - use_mkldnn=True, + use_onednn=True, ) dequantize_op = OpConfig( @@ -106,7 +106,7 @@ def generate_input(): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=['quant_transpose2_dequant_onednn_fuse_pass'], ) yield config, ['fused_transpose', 'fused_transpose'], (1e-5, 1e-5) diff --git a/test/ir/inference/test_onednn_squeeze2_transpose2_fuse_pass.py b/test/ir/inference/test_onednn_squeeze2_transpose2_fuse_pass.py index 3387f244bd4e8d..3b6f86d7d027dc 100644 --- a/test/ir/inference/test_onednn_squeeze2_transpose2_fuse_pass.py +++ b/test/ir/inference/test_onednn_squeeze2_transpose2_fuse_pass.py @@ -78,7 +78,7 @@ def generate_input(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ "squeeze2_transpose2_onednn_fuse_pass", ], diff --git a/test/legacy_test/hygon_dcu/hygon_llama_ops.py b/test/legacy_test/hygon_dcu/hygon_llama_ops.py index c6f0d6d20aa38d..4ead7b15c39028 100644 --- a/test/legacy_test/hygon_dcu/hygon_llama_ops.py +++ b/test/legacy_test/hygon_dcu/hygon_llama_ops.py @@ -480,7 +480,7 @@ def setUp(self): self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} y = x0 + x1 + x2 self.outputs = {'Out': y} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} def init_kernel_type(self): self.dtype = np.float16 @@ -545,7 +545,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def init_kernel_type(self): @@ -631,7 +631,7 @@ def setUp(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def init_kernel_type(self): self.use_onednn = False diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py index e6eca9654f330e..3a5d26c93b9516 100644 --- a/test/legacy_test/op_test.py +++ b/test/legacy_test/op_test.py @@ -633,8 +633,10 @@ def is_float16_op(self): def is_onednn_op(self): return (hasattr(self, "use_onednn") and self.use_onednn) or ( hasattr(self, "attrs") - and "use_mkldnn" in self.attrs - and self.attrs["use_mkldnn"] + and ( + ("use_mkldnn" in self.attrs and self.attrs["use_mkldnn"]) + or ("use_onednn" in self.attrs and self.attrs["use_onednn"]) + ) ) def is_xpu_op(self): @@ -2198,7 +2200,10 @@ def check_inplace_output_with_place( attrs_use_mkldnn = hasattr(self, 'attrs') and bool( self.attrs.get('use_mkldnn', False) ) - if flags_use_onednn or attrs_use_mkldnn: + attrs_use_onednn = hasattr(self, 'attrs') and bool( + self.attrs.get('use_onednn', False) + ) + if flags_use_onednn or attrs_use_mkldnn or attrs_use_onednn: warnings.warn( "check inplace_grad for ops using mkldnn is not supported" ) @@ -3441,9 +3446,13 @@ def check_grad_with_place( cache_list = self.cache_name_list # oneDNN numeric gradient should use CPU kernel - use_onednn = False + use_mkldnn = False if op_attrs.get("use_mkldnn"): op_attrs["use_mkldnn"] = False + use_mkldnn = True + use_onednn = False + if 
op_attrs.get("use_onednn"): + op_attrs["use_onednn"] = False use_onednn = True if hasattr(self, "attrs"): for k, v in self.attrs.items(): @@ -3459,8 +3468,10 @@ def check_grad_with_place( cache_list=cache_list, ) - if use_onednn: + if use_mkldnn: op_attrs["use_mkldnn"] = True + if use_onednn: + op_attrs["use_onednn"] = True if no_grad_set is None: no_grad_set = set() diff --git a/test/legacy_test/test_batch_norm_op.py b/test/legacy_test/test_batch_norm_op.py index b8525403e59876..556a3637791e34 100644 --- a/test/legacy_test/test_batch_norm_op.py +++ b/test/legacy_test/test_batch_norm_op.py @@ -317,7 +317,7 @@ def check_with_place(self, place, data_layout, dtype, shape): # attrs is_test=True, data_layout=data_layout, - use_mkldnn=self.use_onednn, + use_onednn=self.use_onednn, fuse_with_relu=self.fuse_with_relu, epsilon=epsilon, ) diff --git a/test/legacy_test/test_broadcast_tensors_op.py b/test/legacy_test/test_broadcast_tensors_op.py index 296aea9b007e3e..dfac9d35108a77 100644 --- a/test/legacy_test/test_broadcast_tensors_op.py +++ b/test/legacy_test/test_broadcast_tensors_op.py @@ -112,7 +112,7 @@ def set_dtype(self): def setUp(self): self.op_type = "broadcast_tensors" self.use_onednn = False - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} self.test_gen_func_list = [ gen_rank_diff_test, gen_no_broadcast_test, @@ -198,7 +198,7 @@ def setUp(self): self.dtype = np.uint16 self.np_dtype = "float32" self.use_onednn = False - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} self.test_gen_func_list = [ gen_rank_diff_test, gen_no_broadcast_test, diff --git a/test/legacy_test/test_compat_minmax.py b/test/legacy_test/test_compat_minmax.py new file mode 100644 index 00000000000000..00245894df0480 --- /dev/null +++ b/test/legacy_test/test_compat_minmax.py @@ -0,0 +1,386 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle + + +class TestCompatMinMaxBase(unittest.TestCase): + """Base class for the compat min/max tests; by default it exercises the min-related ops""" + + def __init__( + self, + *args, + test_op=paddle.compat.min, + origin_op=paddle.min, + index_op=paddle.argmin, + test_op_name="paddle.compat.min", + origin_op_name="paddle.min", + **kwargs, + ): + super().__init__(*args, **kwargs) + paddle.disable_static() + self.test_op = test_op + self.origin_op = origin_op + self.index_op = index_op + self.test_op_name = test_op_name + self.origin_op_name = origin_op_name + + def test_case1_simple_reduce_all(self): + data = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]], dtype='float32') + val = self.test_op(data) + + if self.test_op_name.endswith("min"): + self.assertAlmostEqual(val.item(), 1.0) + expected_grad = np.array([[0.5, 0.5], [0.0, 0.0]]) + else: + self.assertAlmostEqual(val.item(), 4.0) + expected_grad = np.array([[0.0, 0.0], [0.0, 1.0]]) + + data = paddle.to_tensor( + [[1.0, 1.0], [2.0, 3.0]], dtype='float32', stop_gradient=False + ) + val = self.test_op(data) + val.backward() + + np.testing.assert_allclose(data.grad.numpy(), expected_grad) + + def test_case2_reduce_dim(self): + """Test dim/keepdim""" + data = paddle.to_tensor( + [[[5, 8], [2, 1]], [[7, 3], [9, 6]]], dtype='float32' + ) + if self.test_op_name.endswith("min"): + in_dim = 1 + result = self.test_op(data, dim=in_dim) + expected_res = np.array([[[5, 3], [2, 1]]]) + self.assertEqual(result.values.shape, [2, 2]) + np.testing.assert_array_equal( + result.values.numpy(), np.array([[2, 1], [7, 3]]) + ) + np.testing.assert_array_equal( + result.indices.numpy(), np.array([[1, 1], [0, 0]]) + ) + else: + in_dim = 2 + result = self.test_op(data, dim=in_dim) + expected_res = np.array([[[7, 8], [9, 6]]]) + self.assertEqual(result.values.shape, [2, 2]) + np.testing.assert_array_equal( + result.values.numpy(), np.array([[8, 2], [7, 9]]) + ) + np.testing.assert_array_equal( + result.indices.numpy(), np.array([[1, 0], [0, 0]]) + ) + + result_keep = self.test_op(data, dim=0, keepdim=True) + self.assertEqual(result_keep.values.shape, [1, 2, 2]) + np.testing.assert_array_equal(result_keep.values.numpy(), expected_res) + + result_neg = self.test_op(data, dim=in_dim - 3) + np.testing.assert_array_equal( + result_neg.values.numpy(), result.values.numpy() + ) + + def test_case2_grad(self): + data = paddle.to_tensor( + [[[1.0, 2.0], [1.0, 3.0]], [[4.0, 1.0], [5.0, 1.0]]], + dtype='float32', + stop_gradient=False, + ) + y = data * 2 + + result = self.test_op(y, dim=2) + result.values.backward() + + if self.test_op_name.endswith("min"): + expected_grad = np.array( + [[[2.0, 0.0], [2.0, 0.0]], [[0.0, 2.0], [0.0, 2.0]]] + ) + expected_grad2 = np.array( + [[[2.0, 4.0], [0.0, 0.0]], [[8.0, 2.0], [0.0, 0.0]]] + ) + else: + expected_grad = np.array( + [[[0.0, 2.0], [0.0, 2.0]], [[2.0, 0.0], [2.0, 0.0]]] + ) + expected_grad2 = np.array( + [[[2.0, 0.0], [0.0, 6.0]], [[0.0, 2.0], [10.0, 0.0]]] + ) + np.testing.assert_allclose(data.grad.numpy(), expected_grad, atol=1e-6) + + data.clear_grad() + y = data * data + result = self.test_op(y, dim=1) + result[0].backward() + np.testing.assert_allclose(data.grad.numpy(), expected_grad2, atol=1e-6) + + def test_case3_elementwise(self): + x = paddle.to_tensor([[1, 5], [4, 2]], dtype='float32') + y = paddle.to_tensor([[3, 2], [1, 6]], dtype='float32') + z = paddle.to_tensor([3, 4], dtype='float32') + broadcast_res = self.test_op(x, z) + + result = self.test_op(x, y) + if self.test_op_name.endswith("min"): +
np.testing.assert_array_equal( + result.numpy(), np.array([[1, 2], [1, 2]]) + ) + np.testing.assert_array_equal( + broadcast_res.numpy(), np.array([[1, 4], [3, 2]]) + ) + else: + np.testing.assert_array_equal( + result.numpy(), np.array([[3, 5], [4, 6]]) + ) + np.testing.assert_array_equal( + broadcast_res.numpy(), np.array([[3, 5], [4, 4]]) + ) + + def test_case3_grad(self): + x = paddle.to_tensor( + [[1.0, 2.0], [3.0, 4.0]], dtype=paddle.float32, stop_gradient=False + ) + y = paddle.to_tensor( + [[0.5, 2.5], [2.0, 3.5]], dtype=paddle.float32, stop_gradient=False + ) + + val = self.test_op(x, y) + val.backward() + + expected_x_grad = np.array([[0.0, 1.0], [0.0, 0.0]]) + expected_y_grad = np.array([[1.0, 0.0], [1.0, 1.0]]) + if self.test_op_name.endswith("max"): + expected_x_grad = 1 - expected_x_grad + expected_y_grad = 1 - expected_y_grad + + np.testing.assert_allclose(x.grad.numpy(), expected_x_grad) + np.testing.assert_allclose(y.grad.numpy(), expected_y_grad) + + def test_edge_cases(self): + """Edge-case tests""" + # uniformly distributed gradient for a full reduction over equal values + uniform_data = paddle.ones([2, 3], dtype='float64') + uniform_data.stop_gradient = False + val = self.test_op(uniform_data) + val.sum().backward() + # uniformly distributed + expected_grad = np.full((2, 3), 1.0 / 6.0) + np.testing.assert_allclose(uniform_data.grad.numpy(), expected_grad) + + uniform_data.clear_grad() + val = self.test_op(uniform_data, 0) + val.values.sum().backward() + # take_along_axis like gradient behavior + expected_grad = np.array([[1.0, 1.0, 1.0], [0.0, 0.0, 0.0]]) + np.testing.assert_allclose(uniform_data.grad.numpy(), expected_grad) + + # 0-dim tensor + dim0_tensor = paddle.to_tensor(2, dtype='float32') + val = self.test_op(dim0_tensor) + np.testing.assert_allclose(val.numpy(), np.array(2.0, dtype=np.float32)) + + # 1-dim tensor + dim1_tensor = paddle.to_tensor([1], dtype='uint8') + val = self.test_op(dim1_tensor, dim=-1, keepdim=True) + np.testing.assert_array_equal( + val[0].numpy(), np.array([1], dtype=np.uint8) + ) + np.testing.assert_array_equal( + val[1].numpy(), np.array([0], dtype=np.int64) + ) + + def test_compare_with_index_ops_to_origin(self): + dtypes = ['float32', 'float64', 'int32', 'int64', 'uint8'] + cpu_reject_types = {'int16', 'bfloat16', 'float16'} + + for i, dtype in enumerate(dtypes): + data = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype=dtype) + # `bfloat16` and `float16` are rejected on CPU + if not data.place.is_gpu_place() and dtype in cpu_reject_types: + continue + vals_inds = self.test_op(data, dim=0) + self.assertEqual(vals_inds.values.dtype, data.dtype) + self.assertEqual(vals_inds.indices.dtype, paddle.int64) + + origin_indices = self.index_op(data, axis=0, dtype="int64") + if dtype != 'uint8': + origin_values = self.origin_op(data, axis=0) + else: + origin_values = paddle.take_along_axis( + data, origin_indices.unsqueeze(0), axis=0 + ) + origin_values.squeeze_(axis=0) + if i < 4: # all dtypes except uint8 + np.testing.assert_allclose( + vals_inds.values.numpy(), origin_values.numpy() + ) + else: + np.testing.assert_array_equal( + vals_inds.values.numpy(), origin_values.numpy() + ) + np.testing.assert_array_equal( + vals_inds[1].numpy(), origin_indices.numpy() + ) + + def test_error_handling(self): + """Test that the correct exceptions are raised; most error messages are not checked (some of them are long)""" + + err_msg1 = ( + "Tensors with integral type: 'paddle.int32' should stop gradient." + ) + err_msg2 = ( + f"{self.origin_op_name}() received unexpected keyword arguments 'input', 'dim'. 
" + f"\nDid you mean to use {self.test_op_name}() instead?" + ) + err_msg3 = ( + f"{self.test_op_name}() received unexpected keyword argument 'axis'. " + f"\nDid you mean to use {self.origin_op_name}() instead?" + ) + err_msg4 = ( + "Non-CUDA GPU placed Tensor does not have 'paddle.float16' op registered.\n" + "Paddle support following DataTypes: int32, int64, float64, float32, uint8" + ) + + # empty tensor + empty_tensor = paddle.to_tensor([], dtype='float32') + with self.assertRaises(ValueError): + self.test_op(empty_tensor) + + # mixed parameters case 1 + input_ts = paddle.to_tensor([1, 2, 3], dtype='float32') + other_ts = paddle.to_tensor([1]) + with self.assertRaises(TypeError): + self.test_op(input_ts, other=other_ts, dim=0) + + # mixed parameters case 2 + with self.assertRaises(TypeError): + self.test_op(input_ts, 0, other=other_ts) + + # trying to perform grad ops for integral types + with self.assertRaises(TypeError) as cm: + tensor = paddle.ones([2, 2], dtype=paddle.int32) + tensor.stop_gradient = False + tensors = self.test_op(tensor, dim=0) + self.assertEqual(str(cm.exception), err_msg1) + + # explicit None case 1 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, dim=None) + + # explicit None case 2 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, None, keepdim=True) + + # keepdim specified without specifying dim + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, keepdim=True) + + # Wrong *args specification case 1 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, False) + + # Wrong *args specification case 2 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, other_ts, True) + + # Tensor input for dim case 1 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, dim=paddle.to_tensor([0])) + + # Tensor input for dim case 2 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, dim=paddle.to_tensor(0)) + + # Duplicate Arguments case 1 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, 0, dim=0) + + # Duplicate Arguments case 2 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, other_ts, other=0) + + # Duplicate Arguments case 3 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, dim=0, other=0, keepdim=True) + + # Wrong API used case 1 + with self.assertRaises(TypeError) as cm: + self.origin_op(input=input_ts, dim=0) + self.assertEqual(str(cm.exception), err_msg2) + + # Wrong API used case 2 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, axis=0) + self.assertEqual(str(cm.exception), err_msg3) + + # Rejected on CPU types + with self.assertRaises(TypeError) as cm: + tensor = paddle.to_tensor([1, 2, 3], dtype="float16") + cpu_tensor = tensor.to("cpu") + self.test_op(cpu_tensor, dim=0) + self.assertEqual(str(cm.exception), err_msg4) + + def _compare_with_origin_static(self, input_shape, axis=0, keepdim=False): + if not paddle.is_compiled_with_cuda(): + return + numel = 1 + for v in input_shape: + numel *= v + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + input_tensor = paddle.arange(numel, dtype=paddle.float32).reshape( + input_shape + ) + + y = input_tensor**2 + input_tensor + values, indices = self.test_op(y, dim=axis, keepdim=keepdim) + values += 1 + + gt_values = self.origin_op(y, axis=axis, keepdim=keepdim) + 1 + gt_indices = self.index_op(y, axis=axis, keepdim=keepdim) + + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + values_np, 
indices_np, gt_values_np, gt_indices_np = exe.run( + fetch_list=[values, indices, gt_values, gt_indices] + ) + np.testing.assert_allclose(values_np, gt_values_np) + np.testing.assert_equal(indices_np, gt_indices_np) + paddle.disable_static() + + def test_static_graph(self): + self._compare_with_origin_static([3, 10, 2], axis=1) + self._compare_with_origin_static([3, 10, 2], axis=0, keepdim=True) + self._compare_with_origin_static([17], axis=0) + + +class TestCompatMax(TestCompatMinMaxBase): + def __init__(self, *args, **kwargs): + super().__init__( + *args, + test_op=paddle.compat.max, + origin_op=paddle.max, + index_op=paddle.argmax, + test_op_name="paddle.compat.max", + origin_op_name="paddle.max", + **kwargs, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_compat_split.py b/test/legacy_test/test_compat_split.py new file mode 100644 index 00000000000000..8410e10e1e1caf --- /dev/null +++ b/test/legacy_test/test_compat_split.py @@ -0,0 +1,177 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.compat import split + + +class TestCompatSplit(unittest.TestCase): + def _compare_with_origin(self, input_tensor, size, axis=0): + pd_results = split(input_tensor, size, dim=axis) + + if isinstance(size, int): + shape_on_axis = input_tensor.shape[axis] + remaining_num = shape_on_axis % size + num_sections = shape_on_axis // size + if remaining_num == 0: + size = num_sections + else: + size = [size for _ in range(num_sections)] + size.append(remaining_num) + + origin_results = paddle.split( + input_tensor, num_or_sections=size, axis=axis + ) + + self.assertEqual(len(origin_results), len(pd_results)) + + # check values (and hence shapes) of each corresponding output section + for origin_ts, pd_ts in zip(origin_results, pd_results): + np.testing.assert_allclose(origin_ts.numpy(), pd_ts.numpy()) + + def test_basic_split(self): + """Test basic splitting with integer size""" + data = paddle.arange(12).reshape([3, 4]).astype('float32') + self._compare_with_origin(data, 1, 0) + self._compare_with_origin(data, 2, 1) + + def test_split_with_list_sections(self): + """Test splitting with list of section sizes""" + data = paddle.rand([10, 5]) + self._compare_with_origin(data, [3, 2, 5], 0) + self._compare_with_origin(data, [1, 4], -1) + + def test_chained_operations(self): + """Test split with complex operation chain""" + x = paddle.rand([8, 12]) + y = paddle.sin(x) * 2.0 + paddle.exp(x) / 3.0 + z = paddle.nn.functional.relu(y) + + z1, z2 = split(z, 7, dim=1) + + self.assertEqual(z1.shape, [8, 7]) + self.assertEqual(z2.shape, [8, 5]) + + z_np = z.numpy() + np.testing.assert_allclose(z_np[:, :7], z1.numpy()) + np.testing.assert_allclose(z_np[:, 7:], z2.numpy()) + + def test_split_grad(self): + """Test backprop for split: in1 and in2 go through identical graphs, + split by compat.split and the original split respectively""" + + def get_tensors(): + np.random.seed(114514) + np_arr = np.random.normal(0,
1, [2, 3, 4, 5]) + return paddle.to_tensor(np_arr), paddle.to_tensor(np_arr) + + in1, in2 = get_tensors() + in1.stop_gradient = False + in2.stop_gradient = False + + def computation_graph(in_tensor): + y = in_tensor * 2.3 + 3.0 + y = paddle.maximum(y, paddle.to_tensor([0], dtype=paddle.float32)) + return y.mean(axis=0) + + out1 = computation_graph(in1) + out2 = computation_graph(in2) + + packs1 = paddle.compat.split(out1, 2, dim=2) + packs2 = paddle.split(out2, [2, 2, 1], axis=2) + + res1 = packs1[0] + packs1[1] + packs1[2] + res2 = packs2[0] + packs2[1] + packs2[2] + res1.backward() + res2.backward() + np.testing.assert_allclose(in1.grad.numpy(), in2.grad.numpy()) + + def test_empty_dim(self): + """Split where one requested section has size 0""" + in_tensor = paddle.arange(72, dtype=paddle.int64).reshape([3, 12, 2]) + self._compare_with_origin(in_tensor, [5, 0, 7], axis=1) + + def test_split_with_one_block(self): + """Resulting tuple should be of length 1""" + in_tensor = paddle.arange(60, dtype=paddle.float32).reshape([3, 4, 5]) + self._compare_with_origin(in_tensor, 5, paddle.to_tensor([-1])) + self._compare_with_origin(in_tensor, [5], paddle.to_tensor(2)) + + def test_edge_cases(self): + """Test edge cases and error handling""" + x = paddle.arange(5) + s1, s2 = split(x, [3, 2]) + np.testing.assert_allclose(s1.numpy(), [0, 1, 2]) + np.testing.assert_allclose(s2.numpy(), [3, 4]) + + x = paddle.rand([2, 2, 2]) + a, b = split(x, 1, 2) + self.assertEqual(a.shape, [2, 2, 1]) + + # invalid split sections + with self.assertRaises(ValueError): + split(x, [3, 1], 1) + + # invalid split axis + with self.assertRaises(ValueError): + split(x, 2, 3) + + def test_error_hint(self): + """Test that the correct exception is raised when users pass paddle.split kwargs to paddle.compat.split, and vice versa.""" + x = paddle.randn([3, 9, 5]) + + msg_gt_1 = ( + "paddle.split() received unexpected keyword arguments 'tensor', 'split_size_or_sections', 'dim'. " + "\nDid you mean to use paddle.compat.split() instead?" + ) + msg_gt_2 = ( + "paddle.compat.split() received unexpected keyword argument 'num_or_sections'. " + "\nDid you mean to use paddle.split() instead?" + ) + msg_gt_3 = "(InvalidArgument) The dim is expected to be in range of [-3, 3), but got 3" + msg_gt_4 = "paddle.compat.split expects split_sizes have only non-negative entries, but got size = -5 on dim 2" + + split_size = paddle.to_tensor([3]) + msg_gt_5 = ( + "The type of 'split_size_or_sections' in split must be int, list or tuple in imperative mode, but " + f"received {type(split_size)}."
+ ) + + with self.assertRaises(TypeError) as cm: + tensors = paddle.split(tensor=x, split_size_or_sections=3, dim=0) + self.assertEqual(str(cm.exception), msg_gt_1) + + with self.assertRaises(TypeError) as cm: + tensors = split(x, num_or_sections=3, dim=0) + self.assertEqual(str(cm.exception), msg_gt_2) + + with self.assertRaises(ValueError) as cm: + tensors = split(x, 3, dim=3) + self.assertEqual(str(cm.exception), msg_gt_3) + + with self.assertRaises(ValueError) as cm: + tensors = split(x, [3, 3, -5], -2) + self.assertEqual(str(cm.exception), msg_gt_4) + + with self.assertRaises(TypeError) as cm: + tensors = split(x, split_size, 1) + self.assertEqual(str(cm.exception), msg_gt_5) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_compat_split_static.py b/test/legacy_test/test_compat_split_static.py new file mode 100644 index 00000000000000..006e3ec30ea077 --- /dev/null +++ b/test/legacy_test/test_compat_split_static.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.compat import split + + +class TestCompatSplitStatic(unittest.TestCase): + def _compare_with_origin_static( + self, input_shape, size, axis=0, dim_rank=-1 + ): + """dim_rank: -1 means dim is passed as a plain int, 0 means a 0-dim tensor, 1 means a tensor with shape [1]""" + numel = 1 + for v in input_shape: + numel *= v + input_axis = axis + if dim_rank == 0: + input_axis = paddle.to_tensor(axis) + elif dim_rank == 1: + input_axis = paddle.to_tensor([axis]) + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + input_tensor = paddle.arange(numel, dtype=paddle.float32).reshape( + input_shape + ) + pd_results = split(input_tensor, size, dim=input_axis) + + if isinstance(size, int): + shape_on_axis = input_tensor.shape[axis] + remaining_num = shape_on_axis % size + num_sections = shape_on_axis // size + if remaining_num == 0: + size = num_sections + else: + size = [size for _ in range(num_sections)] + size.append(remaining_num) + + origin_results = paddle.split( + input_tensor, num_or_sections=size, axis=axis + ) + assert len(pd_results) == len(origin_results), "length mismatched" + place = ( + paddle.CUDAPlace(0) + if paddle.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + results = exe.run(fetch_list=[*origin_results, *pd_results]) + length_needed = len(results) // 2 + for i in range(length_needed): + np.testing.assert_allclose( + results[i], results[i + length_needed] + ) + paddle.disable_static() + + def test_split_composite_static(self): + paddle.seed(114514) + + def get_tensors(): + np.random.seed(114514) + np_arr = np.random.normal(0, 1, [2, 3, 4, 5]) + return paddle.to_tensor(np_arr), paddle.to_tensor(np_arr) + + in1, in2 = get_tensors() + in1.stop_gradient = False + in2.stop_gradient = False + + @paddle.jit.to_static + def computation_graph(in1: paddle.Tensor, in2:
paddle.Tensor): + y1 = in1 * 1.5 + 1.0 + y1 = paddle.minimum(y1, paddle.to_tensor([0], dtype=paddle.float32)) + out1 = y1.mean(axis=0) + + y2 = in2 * 1.5 + 1.0 + y2 = paddle.minimum(y2, paddle.to_tensor([0], dtype=paddle.float32)) + out2 = y2.mean(axis=0) + + packs1 = paddle.compat.split(out1, 2, dim=2) + packs2 = paddle.split(out2, [2, 2, 1], axis=2) + + res1 = packs1[0] + packs1[1] + packs1[2] + res2 = packs2[0] + packs2[1] + packs2[2] + + return res1, res2 + + res1, res2 = computation_graph(in1, in2) + np.testing.assert_allclose(res1.numpy(), res2.numpy()) + + def test_static_graph(self): + """Test static graph execution""" + # fixed random seed for reproducibility + np.random.seed(114514) + # old static graph mode + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name='x', shape=[None, 6], dtype='float32') + result0, result1 = split(x, split_size_or_sections=[3, 3], dim=1) + output = result0 * 2.0 + paddle.sin(result1) + + place = ( + paddle.CUDAPlace(0) + if paddle.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + + input_data = np.random.rand(3, 6).astype('float32') + feed = {'x': input_data} + + results = exe.run(feed=feed, fetch_list=[result0, result1, output]) + + pd_result0, pd_result1 = results[0], results[1] + np.testing.assert_allclose(input_data[:, :3], pd_result0) + np.testing.assert_allclose(input_data[:, 3:], pd_result1) + + expected_output = input_data[:, :3] * 2.0 + np.sin( + input_data[:, 3:] + ) + np.testing.assert_allclose( + expected_output, results[2], rtol=1e-4, atol=1e-4 + ) + + paddle.disable_static() + + def test_error_hint(self): + """Test that the correct exception is raised when users pass paddle.split kwargs to paddle.compat.split, and vice versa.""" + + msg_gt_1 = "split_size_or_sections must be greater than 0." + msg_gt_2 = "len(split_size_or_sections) must not be more than input.shape[dim]." + msg_gt_3 = "The type of 'split_size_or_sections' in split must be int, list or tuple in imperative mode." + msg_gt_4 = ( + "'dim' is not allowed to be a pir.Value in a static graph: " + "\npir.Value can not be used for indexing python lists/tuples."
+ ) + + paddle.enable_static() + with self.assertRaises(AssertionError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, -2, dim=0) + self.assertEqual(str(cm.exception), msg_gt_1) + + with self.assertRaises(AssertionError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, (1, 1, 1, 1, 2, 2), dim=-1) + self.assertEqual(str(cm.exception), msg_gt_2) + + with self.assertRaises(TypeError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, paddle.to_tensor(2), dim=2) + self.assertEqual(str(cm.exception), msg_gt_3) + + with self.assertRaises(TypeError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, 2, dim=paddle.to_tensor(2)) + paddle.disable_static() + self.assertEqual(str(cm.exception), msg_gt_4) + + def test_basic_split(self): + """Test basic splitting with integer size""" + input_shape = [3, 6] + self._compare_with_origin_static(input_shape, 1, 0) + self._compare_with_origin_static(input_shape, 3, -1) + self._compare_with_origin_static(input_shape, 4, dim_rank=0) + self._compare_with_origin_static(input_shape, 3, dim_rank=1) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_complex_op.py b/test/legacy_test/test_complex_op.py index d5ba26549d5a40..d0df015677f6b0 100644 --- a/test/legacy_test/test_complex_op.py +++ b/test/legacy_test/test_complex_op.py @@ -19,7 +19,7 @@ import paddle from paddle import static -from paddle.base import dygraph +from paddle.base import core, dygraph paddle.enable_static() @@ -134,6 +134,7 @@ def test_dygraph(self): np.testing.assert_allclose(self.out, out_np, rtol=1e-05) def test_static(self): + paddle.enable_static() mp, sp = static.Program(), static.Program() with static.program_guard(mp, sp): x = static.data("x", shape=[10, 10], dtype="float64") @@ -148,5 +149,116 @@ def test_static(self): np.testing.assert_allclose(self.out, out_np, rtol=1e-05) +class OutTest(unittest.TestCase): + def setUp(self): + paddle.disable_static() + if core.is_compiled_with_cuda(): + self.place = core.CUDAPlace(0) + else: + self.place = core.CPUPlace() + + def test_complex_api(self): + def run_complex(test_type): + x = paddle.arange(2, dtype=paddle.float32).unsqueeze(-1) + y = paddle.arange(3, dtype=paddle.float32) + x.stop_gradient = False + y.stop_gradient = False + z = paddle.ones([100]) + z.stop_gradient = False + + a = x + x + b = y + y + c = z + z + + if test_type == "return": + c = paddle.complex(a, b) + elif test_type == "input_out": + paddle.complex(a, b, out=c) + elif test_type == "both_return": + c = paddle.complex(a, b, out=c) + elif test_type == "both_input_out": + tmp = paddle.complex(a, b, out=c) + + out = paddle._C_ops.complex(a, b) + np.testing.assert_allclose( + out.numpy(), + c.numpy(), + 1e-20, + 1e-20, + ) + + d = c + c + + d.mean().backward() + + return c, x.grad, y.grad, z.grad + + paddle.disable_static() + out1, x1, y1, z1 = run_complex("return") + out2, x2, y2, z2 = run_complex("input_out") + out3, x3, y3, z3 = run_complex("both_return") + out4, x4, y4, z4 = run_complex("both_input_out") + + np.testing.assert_allclose( + out1.numpy(), + out2.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_allclose( + out1.numpy(), + out3.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_allclose( + out1.numpy(), + out4.numpy(), + 1e-20, + 1e-20, + ) + + np.testing.assert_allclose( + x1.numpy(), + x2.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_allclose( + x1.numpy(), + x3.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_allclose( + x1.numpy(), + x4.numpy(), + 1e-20, + 1e-20, + ) +
np.testing.assert_allclose( + y1.numpy(), + y2.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_allclose( + y1.numpy(), + y3.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_allclose( + y1.numpy(), + y4.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_equal(z1, None) + np.testing.assert_equal(z2, None) + np.testing.assert_equal(z3, None) + np.testing.assert_equal(z4, None) + + if __name__ == "__main__": unittest.main() diff --git a/test/legacy_test/test_conv2d_op.py b/test/legacy_test/test_conv2d_op.py index 1186f1c2720115..4ee915872aa85a 100644 --- a/test/legacy_test/test_conv2d_op.py +++ b/test/legacy_test/test_conv2d_op.py @@ -483,7 +483,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'fuse_relu_before_depthwise_conv': self.fuse_relu_before_depthwise_conv, 'exhaustive_search': self.exhaustive_search, @@ -817,7 +817,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'fuse_relu_before_depthwise_conv': self.fuse_relu_before_depthwise_conv, 'exhaustive_search': self.exhaustive_search, diff --git a/test/legacy_test/test_conv2d_transpose_op.py b/test/legacy_test/test_conv2d_transpose_op.py index 1dbfeda253f482..f62e3b5277da6a 100644 --- a/test/legacy_test/test_conv2d_transpose_op.py +++ b/test/legacy_test/test_conv2d_transpose_op.py @@ -210,7 +210,7 @@ def setUp(self): 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, 'is_test': self.is_test, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } if self.output_size is not None: diff --git a/test/legacy_test/test_conv3d_op.py b/test/legacy_test/test_conv3d_op.py index 65cc6c0c26431b..63c003118219f8 100644 --- a/test/legacy_test/test_conv3d_op.py +++ b/test/legacy_test/test_conv3d_op.py @@ -444,7 +444,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } self.outputs = {'Output': output} @@ -804,7 +804,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } self.outputs = {'Output': output} diff --git a/test/legacy_test/test_elementwise_add_op.py b/test/legacy_test/test_elementwise_add_op.py index 4d8d2d2815d942..e0000e7d6aa992 100644 --- a/test/legacy_test/test_elementwise_add_op.py +++ b/test/legacy_test/test_elementwise_add_op.py @@ -47,7 +47,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def check_dygraph(self): @@ -244,7 +244,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(convert_float_to_uint16(self.x)), 'Y': OpTest.np_dtype_to_base_dtype(convert_float_to_uint16(self.y)), } - self.attrs = {'axis': self.axis, 'use_mkldnn': False} + self.attrs = {'axis': self.axis, 'use_onednn': False} self.outputs = {'Out': convert_float_to_uint16(self.out)} self.if_enable_cinn() @@ -827,7 +827,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': 
OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -968,7 +968,7 @@ def test_warnings(self): type="elementwise_add", inputs={'X': data, 'Y': data}, outputs={'Out': out}, - attrs={'axis': 1, 'use_mkldnn': False}, + attrs={'axis': 1, 'use_onednn': False}, ) self.assertTrue( "op elementwise_add's attr axis = 1 is not the default value: -1" @@ -1042,7 +1042,7 @@ def setUp(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def check_dygraph(self): diff --git a/test/legacy_test/test_elementwise_div_op.py b/test/legacy_test/test_elementwise_div_op.py index 0ff6dd4a26bac8..e6502ebef6146b 100644 --- a/test/legacy_test/test_elementwise_div_op.py +++ b/test/legacy_test/test_elementwise_div_op.py @@ -589,7 +589,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): diff --git a/test/legacy_test/test_elementwise_floordiv_op.py b/test/legacy_test/test_elementwise_floordiv_op.py index 1a8266f27beb75..186592c609e56a 100644 --- a/test/legacy_test/test_elementwise_floordiv_op.py +++ b/test/legacy_test/test_elementwise_floordiv_op.py @@ -43,7 +43,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def test_check_output(self): diff --git a/test/legacy_test/test_elementwise_mod_op.py b/test/legacy_test/test_elementwise_mod_op.py index 3620215c186114..618643229d73ec 100644 --- a/test/legacy_test/test_elementwise_mod_op.py +++ b/test/legacy_test/test_elementwise_mod_op.py @@ -46,7 +46,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def test_check_output(self): @@ -195,7 +195,7 @@ def setUp(self): 'X': convert_float_to_uint16(OpTest.np_dtype_to_base_dtype(self.x)), 'Y': convert_float_to_uint16(OpTest.np_dtype_to_base_dtype(self.y)), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': convert_float_to_uint16(self.out)} def test_check_output(self): diff --git a/test/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py index a4f365ea92b1a8..8c6fbc679213af 100644 --- a/test/legacy_test/test_elementwise_mul_op.py +++ b/test/legacy_test/test_elementwise_mul_op.py @@ -49,7 +49,7 @@ def setUp(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def test_check_output(self): # TODO(wangzhongpu): support onednn op in dygraph mode @@ -242,7 +242,7 @@ def setUp(self): 'Y': OpTest.np_dtype_to_base_dtype(convert_float_to_uint16(self.y)), } self.outputs = {'Out': 
convert_float_to_uint16(self.out)} - self.attrs = {'axis': self.axis, 'use_mkldnn': False} + self.attrs = {'axis': self.axis, 'use_onednn': False} self.if_enable_cinn() def test_check_output(self): @@ -381,7 +381,7 @@ def init_input_attr_output(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def init_dtype(self): self.dtype = np.float64 @@ -406,7 +406,7 @@ def init_input_attr_output(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def init_axis(self): self.axis = 0 @@ -592,7 +592,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): diff --git a/test/legacy_test/test_elementwise_sub_op.py b/test/legacy_test/test_elementwise_sub_op.py index 28e336539f868d..736f1b33d7f7c5 100644 --- a/test/legacy_test/test_elementwise_sub_op.py +++ b/test/legacy_test/test_elementwise_sub_op.py @@ -859,7 +859,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} self.if_check_prim() self.if_enable_cinn() @@ -1207,7 +1207,7 @@ def test_warnings(self): type="elementwise_sub", inputs={'X': data, 'Y': data}, outputs={'Out': out}, - attrs={'axis': 1, 'use_mkldnn': False}, + attrs={'axis': 1, 'use_onednn': False}, ) self.assertTrue( "op elementwise_sub's attr axis = 1 is not the default value: -1" diff --git a/test/legacy_test/test_expand_v2_op.py b/test/legacy_test/test_expand_v2_op.py index 5e7ca4b28f92ef..ccf32a49665cbf 100644 --- a/test/legacy_test/test_expand_v2_op.py +++ b/test/legacy_test/test_expand_v2_op.py @@ -764,7 +764,7 @@ def setUp(self): self.init_place() self.python_api = paddle.expand self.x = np.zeros(self.ori_shape).astype("float32") - self.attrs = {'shape': self.shape, 'use_mkldnn': True} + self.attrs = {'shape': self.shape, 'use_onednn': True} self.use_onednn = True self.set_inputs() self.set_additional_inputs() diff --git a/test/legacy_test/test_fc_op.py b/test/legacy_test/test_fc_op.py index d61c93361097b7..a740ce0c49c304 100644 --- a/test/legacy_test/test_fc_op.py +++ b/test/legacy_test/test_fc_op.py @@ -73,7 +73,7 @@ def setUp(self): activation_type = "relu" else: activation_type = "" - self.attrs = {'use_mkldnn': False, 'activation_type': activation_type} + self.attrs = {'use_onednn': False, 'activation_type': activation_type} self.outputs = { 'Out': fc_refer(self.matrix, self.with_bias, self.with_relu) diff --git a/test/legacy_test/test_fused_transpose_split_quant_op.py b/test/legacy_test/test_fused_transpose_split_quant_op.py index edfea14fc1f35d..6c8604ba2ea876 100644 --- a/test/legacy_test/test_fused_transpose_split_quant_op.py +++ b/test/legacy_test/test_fused_transpose_split_quant_op.py @@ -17,8 +17,20 @@ import paddle -def fused_transpose_split_quant_ref(x, tokens_per_expert, pow_2_scales): +def dequant_ref( + fp8_tensor: paddle.Tensor, scale: paddle.Tensor, block_size: int = 128 +) -> paddle.Tensor: + """Helper function to dequantize fp8 tensor to 
bf16""" + expanded_scale = paddle.repeat_interleave(scale, repeats=128, axis=-1) + # Handle non-aligned cases by truncating + expanded_scale = expanded_scale[:, : fp8_tensor.shape[-1]] + return (fp8_tensor.astype('float32') * expanded_scale).astype('bfloat16') + + +def fused_transpose_split_quant_ref(x, xscale, tokens_per_expert, pow_2_scales): shape = x.shape + if x.dtype == paddle.float8_e4m3fn: + x = dequant_ref(x, xscale) x = x.reshape([shape[0] // 128, 128, shape[1]]) amax = x.astype('float32').abs().max(axis=1) @@ -37,43 +49,76 @@ def fused_transpose_split_quant_ref(x, tokens_per_expert, pow_2_scales): return out, scale -def test_fused_transpose_split_quant(tokens_per_expert, seq_len, pow_2_scales): +def test_fused_transpose_split_quant( + tokens_per_expert, seq_len, pow_2_scales, using_fp8=False +): x = paddle.randn([sum(tokens_per_expert), seq_len], dtype='bfloat16') - x = paddle.clip(x, min=-50, max=50) + if using_fp8: + x = x.cast('float8_e4m3fn') + xscale = ( + paddle.randn( + [sum(tokens_per_expert), (seq_len + 127) // 128], dtype='float32' + ) + if using_fp8 + else None + ) + # x = paddle.clip(x, min=-50, max=50) out, scale = paddle.incubate.nn.functional.fused_transpose_split_quant( - x, tokens_per_expert, pow_2_scales + x, xscale, tokens_per_expert, pow_2_scales ) out_ref, scale_ref = fused_transpose_split_quant_ref( - x, tokens_per_expert, pow_2_scales + x, xscale, tokens_per_expert, pow_2_scales ) for t, t_ref in zip(out, out_ref): - np.testing.assert_allclose(t.astype('float32'), t_ref.astype('float32')) + try: + np.testing.assert_allclose( + t.astype('float32'), t_ref.astype('float32') + ) + except AssertionError as e: + print("AssertionError", e) for t, t_ref in zip(scale, scale_ref): - np.testing.assert_allclose(t, t_ref) + try: + np.testing.assert_allclose(t, t_ref) + except AssertionError as e: + print("AssertionError", e) def run(): - test_fused_transpose_split_quant([0, 0], 1024, False) - test_fused_transpose_split_quant([128, 2 * 128], 0, True) - test_fused_transpose_split_quant([128], 1, False) - test_fused_transpose_split_quant([0, 128, 0, 2 * 128], 127, True) - test_fused_transpose_split_quant([3 * 128, 4 * 128, 5 * 128], 233, False) - test_fused_transpose_split_quant( - [24 * 128, 128, 50 * 128, 16 * 128], 2162, True - ) - test_fused_transpose_split_quant( - [7 * 128, 29 * 128, 3 * 128, 128 * 128, 13 * 128], 4000, False - ) - test_fused_transpose_split_quant( - [18 * 128, 5 * 128, 24 * 128, 128, 6 * 128, 0, 27 * 128, 7 * 128], - 7168, - True, - ) + fp8_choice = [True, False] + for using_fp8 in fp8_choice: + test_fused_transpose_split_quant( + [0, 0], 1024, False, using_fp8=using_fp8 + ) + test_fused_transpose_split_quant( + [128, 2 * 128], 0, True, using_fp8=using_fp8 + ) + test_fused_transpose_split_quant([128], 1, False, using_fp8=using_fp8) + test_fused_transpose_split_quant( + [0, 128, 0, 2 * 128], 127, True, using_fp8=using_fp8 + ) + test_fused_transpose_split_quant( + [3 * 128, 4 * 128, 5 * 128], 233, False, using_fp8=using_fp8 + ) + test_fused_transpose_split_quant( + [24 * 128, 128, 50 * 128, 16 * 128], 2162, True, using_fp8=using_fp8 + ) + test_fused_transpose_split_quant( + [7 * 128, 29 * 128, 3 * 128, 128 * 128, 13 * 128], + 4000, + False, + using_fp8=using_fp8, + ) + test_fused_transpose_split_quant( + [18 * 128, 5 * 128, 24 * 128, 128, 6 * 128, 0, 27 * 128, 7 * 128], + 7168, + True, + using_fp8=using_fp8, + ) if __name__ == '__main__': diff --git a/test/legacy_test/test_fusion_gru_op.py b/test/legacy_test/test_fusion_gru_op.py index 
950142835e6524..80f2bd185876b5 100644 --- a/test/legacy_test/test_fusion_gru_op.py +++ b/test/legacy_test/test_fusion_gru_op.py @@ -111,7 +111,7 @@ def setUp(self): 'gate_activation': self.act_gate, 'is_reverse': self.is_reverse, 'origin_mode': self.origin_mode, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } def test_check_output(self): diff --git a/test/legacy_test/test_gaussian_random_op.py b/test/legacy_test/test_gaussian_random_op.py index c4f860bcc7e973..36b8453b097865 100644 --- a/test/legacy_test/test_gaussian_random_op.py +++ b/test/legacy_test/test_gaussian_random_op.py @@ -40,7 +40,7 @@ def setUp(self): "mean": self.mean, "std": self.std, "seed": 10, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, } paddle.seed(10) @@ -82,7 +82,7 @@ def setUp(self): "std": self.std, "seed": 10, "dtype": paddle.float16, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, } paddle.seed(10) @@ -134,7 +134,7 @@ def setUp(self): "std": self.std, "seed": 10, "dtype": paddle.bfloat16, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, } paddle.seed(10) @@ -184,7 +184,7 @@ def setUp(self): 'mean': self.mean, 'std': self.std, 'seed': self.seed, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.inputs = {"ShapeTensorList": shape_tensor_list} @@ -251,7 +251,7 @@ def setUp(self): 'mean': self.mean, 'std': self.std, 'seed': self.seed, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = {'Out': np.zeros((123, 92), dtype='float32')} diff --git a/test/legacy_test/test_kron_op.py b/test/legacy_test/test_kron_op.py index 05ff4b6dd777a4..7f634707a352f9 100644 --- a/test/legacy_test/test_kron_op.py +++ b/test/legacy_test/test_kron_op.py @@ -272,7 +272,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): diff --git a/test/legacy_test/test_matmul_v2_op.py b/test/legacy_test/test_matmul_v2_op.py index 5e1938dc704141..16bce228f637b5 100644 --- a/test/legacy_test/test_matmul_v2_op.py +++ b/test/legacy_test/test_matmul_v2_op.py @@ -713,7 +713,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -766,7 +766,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -828,7 +828,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -854,7 +854,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -880,7 +880,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = 
{'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -906,7 +906,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -950,7 +950,7 @@ def setUp(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.out = np.matmul(self.x, self.y) - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_input_output(self): diff --git a/test/legacy_test/test_ones_op.py b/test/legacy_test/test_ones_op.py index 3394bc611e7bfe..63ea2930633414 100644 --- a/test/legacy_test/test_ones_op.py +++ b/test/legacy_test/test_ones_op.py @@ -20,38 +20,121 @@ class ApiOnesTest(unittest.TestCase): - def test_paddle_ones(self): + def test_static_ones(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones(10, dtype=paddle.float32) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10], dtype="float32") + np.testing.assert_equal(result, expect) + + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones(10, 2, 3, dtype=paddle.float32) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones([10, 2, 3], dtype=paddle.float32) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones(size=[10, 2, 3], dtype=paddle.float32) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones([10, 2, 3], paddle.float32) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones([10, 2, 3], "float32") + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones(shape=[10, 2, 3], dtype=paddle.float32) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + with paddle.static.program_guard(paddle.static.Program()): ones = paddle.ones(shape=[10]) place = paddle.CPUPlace() exe = paddle.static.Executor(place) (result,) = exe.run(fetch_list=[ones]) - expected_result = np.ones(10, dtype="float32") - self.assertEqual((result == expected_result).all(), True) + expect =
np.ones(10, dtype="float32") + np.testing.assert_equal(result, expect) with paddle.static.program_guard(paddle.static.Program()): ones = paddle.ones(shape=[10], dtype="float64") place = paddle.CPUPlace() exe = paddle.static.Executor(place) (result,) = exe.run(fetch_list=[ones]) - expected_result = np.ones(10, dtype="float64") - self.assertEqual((result == expected_result).all(), True) + expect = np.ones(10, dtype="float64") + np.testing.assert_equal(result, expect) with paddle.static.program_guard(paddle.static.Program()): ones = paddle.ones(shape=[10], dtype="int64") place = paddle.CPUPlace() exe = paddle.static.Executor(place) (result,) = exe.run(fetch_list=[ones]) - expected_result = np.ones(10, dtype="int64") - self.assertEqual((result == expected_result).all(), True) + expect = np.ones(10, dtype="int64") + np.testing.assert_equal(result, expect) with paddle.static.program_guard(paddle.static.Program()): ones = paddle.ones(shape=10, dtype="int64") place = paddle.CPUPlace() exe = paddle.static.Executor(place) (result,) = exe.run(fetch_list=[ones]) - expected_result = np.ones(10, dtype="int64") - self.assertEqual((result == expected_result).all(), True) + expect = np.ones(10, dtype="int64") + np.testing.assert_equal(result, expect) + paddle.disable_static() + + def test_dygraph_ones(self): + paddle.disable_static() + result = paddle.ones(10, dtype=paddle.float32) + expect = np.ones([10], dtype="float32") + np.testing.assert_equal(result, expect) + + result = paddle.ones(10, 2, 3, dtype=paddle.float32) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + + result = paddle.ones([10, 2, 3], dtype=paddle.float32) + np.testing.assert_equal(result, expect) + + result = paddle.ones(size=[10, 2, 3], dtype=paddle.float32) + np.testing.assert_equal(result, expect) + + result = paddle.ones([10, 2, 3], paddle.float32) + np.testing.assert_equal(result, expect) + + result = paddle.ones([10, 2, 3], "float32") + np.testing.assert_equal(result, expect) + + result = paddle.ones(shape=[10, 2, 3], dtype=paddle.float32) + np.testing.assert_equal(result, expect) if __name__ == "__main__": diff --git a/test/legacy_test/test_pool2d_op.py b/test/legacy_test/test_pool2d_op.py index 3c38b4a1ec9381..b2eea65d3caef0 100644 --- a/test/legacy_test/test_pool2d_op.py +++ b/test/legacy_test/test_pool2d_op.py @@ -451,7 +451,7 @@ def setUp(self): 'pooling_type': self.pool_type, 'global_pooling': self.global_pool, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'ceil_mode': self.ceil_mode, 'data_format': self.data_format, 'exclusive': self.exclusive, diff --git a/test/legacy_test/test_reshape_op.py b/test/legacy_test/test_reshape_op.py index 06ff5633ba4b07..011ae2a55606d5 100755 --- a/test/legacy_test/test_reshape_op.py +++ b/test/legacy_test/test_reshape_op.py @@ -406,7 +406,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(input)} self.attrs = { 'shape': self.new_shape, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { "Out": self.inputs["X"].reshape(self.inferred_shape), diff --git a/test/legacy_test/test_sgd_op_bf16.py b/test/legacy_test/test_sgd_op_bf16.py index 25bacbbecf0aff..4cefc0c97df638 100644 --- a/test/legacy_test/test_sgd_op_bf16.py +++ b/test/legacy_test/test_sgd_op_bf16.py @@ -49,7 +49,7 @@ def setUp(self): self.inputs = {'Param': w_bf16, 'Grad': g_bf16, 'LearningRate': lr_bf16} self.outputs = {'ParamOut': w - lr * g} - self.attrs = {'use_mkldnn': self.use_onednn} + 
self.attrs = {'use_onednn': self.use_onednn} def conf(self): self.h = 102 @@ -157,7 +157,7 @@ def test_sparse_grad_sgd(self): Grad='Grad', ParamOut='Param', LearningRate='LearningRate', - use_mkldnn=True, + use_onednn=True, ) sgd_op.run(scope, place) @@ -215,7 +215,7 @@ def test_sparse_param_grad_sgd(self): Grad='Grad', ParamOut='Param', LearningRate='LearningRate', - use_mkldnn=True, + use_onednn=True, ) sgd_op.run(scope, place) diff --git a/test/legacy_test/test_slice_op.py b/test/legacy_test/test_slice_op.py index 88e8b802c5a704..a75b4192ac986a 100644 --- a/test/legacy_test/test_slice_op.py +++ b/test/legacy_test/test_slice_op.py @@ -160,7 +160,7 @@ def setUp(self): 'starts': self.starts, 'ends': self.ends, 'infer_flags': self.infer_flags, - 'use_mkldnn': True, + 'use_onednn': True, } def config(self): diff --git a/test/legacy_test/test_softmax_op.py b/test/legacy_test/test_softmax_op.py index d44102567b6c84..1b9ce32daac00c 100644 --- a/test/legacy_test/test_softmax_op.py +++ b/test/legacy_test/test_softmax_op.py @@ -78,7 +78,7 @@ def setUp(self): self.attrs = { 'axis': self.axis, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.enable_cinn = True @@ -161,7 +161,7 @@ def setUp(self): self.attrs = { 'axis': -1, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.enable_cinn = False @@ -210,7 +210,7 @@ def setUp(self): self.attrs = { 'axis': -1, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.enable_cinn = False @@ -487,7 +487,7 @@ def setUp(self): self.attrs = { 'axis': self.axis, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } def init_cudnn(self): diff --git a/test/legacy_test/test_sum_op.py b/test/legacy_test/test_sum_op.py index 0746cc46d022a9..f310d4400e2847 100644 --- a/test/legacy_test/test_sum_op.py +++ b/test/legacy_test/test_sum_op.py @@ -37,7 +37,7 @@ from paddle.framework import in_pir_mode -def sum_wrapper(X, use_mkldnn=False): +def sum_wrapper(X, use_onednn=False): res = paddle.full(shape=X[0].shape, fill_value=0.0, dtype=X[0].dtype) for x in X: res = paddle.add(res, x) @@ -59,7 +59,7 @@ def setUp(self): self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} y = x0 + x1 + x2 self.outputs = {'Out': y} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} def init_kernel_type(self): self.dtype = np.float64 diff --git a/test/legacy_test/test_transpose_op.py b/test/legacy_test/test_transpose_op.py index 69f2f55dd063a7..c229b0578a8724 100644 --- a/test/legacy_test/test_transpose_op.py +++ b/test/legacy_test/test_transpose_op.py @@ -35,7 +35,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype("float64")} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': np.random.random(self.shape).astype("float64"), @@ -146,7 +146,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype("float64")} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': np.random.random(self.shape).astype("float64"), @@ -169,7 +169,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype("float64")} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': 
np.random.random(self.shape).astype("float64"), @@ -191,7 +191,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype("float64")} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': np.random.random(self.shape).astype("float64"), @@ -234,7 +234,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': np.random.random(self.shape).astype(self.dtype), @@ -279,7 +279,7 @@ def setUp(self): self.inputs = {'X': convert_float_to_uint16(x)} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': convert_float_to_uint16( @@ -330,7 +330,7 @@ def setUp(self): self.inputs = {'X': x} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': np.random.random(self.shape).astype(self.dtype), @@ -376,7 +376,7 @@ def setUp(self): self.inputs = {'X': convert_float_to_uint16(x)} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': convert_float_to_uint16( diff --git a/test/mkldnn/onednn_op_test.py b/test/mkldnn/onednn_op_test.py index 7eabd3b4d9c0ff..171000f910ded9 100644 --- a/test/mkldnn/onednn_op_test.py +++ b/test/mkldnn/onednn_op_test.py @@ -48,7 +48,7 @@ def check_if_onednn_primitives_exist_in_bwd( 'X': block.var('x'), }, outputs={'Out': block.var('out')}, - attrs={'use_mkldnn': True}, + attrs={'use_onednn': True}, ) # Generate backward op_desc @@ -122,7 +122,7 @@ def check_if_onednn_batchnorm_primitives_exist_in_bwd( "epsilon": test_case.epsilon, "is_test": False, "data_layout": data_layout, - "use_mkldnn": test_case.use_mkldnn, + "use_onednn": test_case.use_onednn, "fuse_with_relu": test_case.fuse_with_relu, "use_global_stats": test_case.use_global_stats, }, diff --git a/test/mkldnn/test_activation_bf16_mkldnn_op.py b/test/mkldnn/test_activation_bf16_mkldnn_op.py index e5ac9d71a044a3..d9685692eb9a72 100644 --- a/test/mkldnn/test_activation_bf16_mkldnn_op.py +++ b/test/mkldnn/test_activation_bf16_mkldnn_op.py @@ -39,7 +39,7 @@ def op_grad(self, dout, x): pass def set_attrs(self): - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_data(self): self.x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype(np.float32) @@ -147,7 +147,7 @@ def op_grad(self, dout, x): ) def set_attrs(self): - self.attrs = {"use_mkldnn": True, "approximate": True} + self.attrs = {"use_onednn": True, "approximate": True} class TestONEDNNGeluTanhDim2BF16Op(TestONEDNNGeluTanhBF16Op): @@ -211,7 +211,7 @@ def op_grad(self, dout, x): def set_attrs(self): self.alpha = 0.2 - self.attrs = {"use_mkldnn": True, "alpha": self.alpha} + self.attrs = {"use_onednn": True, "alpha": self.alpha} class TestONEDNNSwishBF16Op(ONEDNNBF16ActivationOp, TestActivation): @@ -230,7 +230,7 @@ def op_grad(self, dout, x): def set_attrs(self): self.beta = 0.2 - self.attrs = {"use_mkldnn": True, "beta": self.beta} + self.attrs = {"use_onednn": True, "beta": self.beta} class TestONEDNNHardSwishBF16Op(ONEDNNBF16ActivationOp, TestActivation): @@ -284,7 +284,7 @@ def op_grad(self, dout, x): def set_attrs(self): self.alpha = 0.2 - self.attrs = {"use_mkldnn": True, "alpha": self.alpha} + self.attrs = {"use_onednn": True, "alpha": 
self.alpha} class TestONEDNNExpBF16Op(ONEDNNBF16ActivationOp, TestActivation): diff --git a/test/mkldnn/test_bilinear_interp_v2_mkldnn_op.py b/test/mkldnn/test_bilinear_interp_v2_mkldnn_op.py index eca6ef8b9c7b0e..84970be1aaf057 100644 --- a/test/mkldnn/test_bilinear_interp_v2_mkldnn_op.py +++ b/test/mkldnn/test_bilinear_interp_v2_mkldnn_op.py @@ -145,7 +145,7 @@ def setUp(self): 'out_w': self.out_w, 'scale': self.scale, 'data_layout': self.data_layout, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = {'Out': output_np} diff --git a/test/mkldnn/test_cast_mkldnn_op.py b/test/mkldnn/test_cast_mkldnn_op.py index db12d0b21101bf..02aa59396208d7 100644 --- a/test/mkldnn/test_cast_mkldnn_op.py +++ b/test/mkldnn/test_cast_mkldnn_op.py @@ -42,7 +42,7 @@ def setUp(self): self.attrs = { 'in_dtype': prepare_dtype(self.x), 'out_dtype': prepare_dtype(self.out), - 'use_mkldnn': True, + 'use_onednn': True, } self.op_type = 'cast' diff --git a/test/mkldnn/test_concat_bf16_mkldnn_op.py b/test/mkldnn/test_concat_bf16_mkldnn_op.py index 606deb6976d4ac..0faf7e16482fb5 100644 --- a/test/mkldnn/test_concat_bf16_mkldnn_op.py +++ b/test/mkldnn/test_concat_bf16_mkldnn_op.py @@ -35,7 +35,7 @@ def setUp(self): self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} self.attrs = { 'axis': self.axis, - 'use_mkldnn': True, + 'use_onednn': True, 'mkldnn_data_type': self.onednn_data_type, } diff --git a/test/mkldnn/test_concat_int8_mkldnn_op.py b/test/mkldnn/test_concat_int8_mkldnn_op.py index 89d2b71c688807..7f25b41c4191ea 100644 --- a/test/mkldnn/test_concat_int8_mkldnn_op.py +++ b/test/mkldnn/test_concat_int8_mkldnn_op.py @@ -27,7 +27,7 @@ def setUp(self): self.init_shape() self.init_test_data() self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} - self.attrs = {'axis': self.axis, 'use_mkldnn': True} + self.attrs = {'axis': self.axis, 'use_onednn': True} self.output = np.concatenate( (self.x0, self.x1, self.x2), axis=self.axis diff --git a/test/mkldnn/test_conv2d_bf16_mkldnn_op.py b/test/mkldnn/test_conv2d_bf16_mkldnn_op.py index da802ed21ba979..562595733933df 100644 --- a/test/mkldnn/test_conv2d_bf16_mkldnn_op.py +++ b/test/mkldnn/test_conv2d_bf16_mkldnn_op.py @@ -110,7 +110,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, 'force_fp32_output': self.force_fp32_output, 'fuse_residual_connection': self.fuse_residual, diff --git a/test/mkldnn/test_conv2d_int8_mkldnn_op.py b/test/mkldnn/test_conv2d_int8_mkldnn_op.py index d2e6d33607e4fe..23b3e938349b2f 100644 --- a/test/mkldnn/test_conv2d_int8_mkldnn_op.py +++ b/test/mkldnn/test_conv2d_int8_mkldnn_op.py @@ -166,7 +166,7 @@ def residual_helper(init_low, init_high, output_): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'exhaustive_search': self.exhaustive_search, 'Scale_in': self.scale_in, diff --git a/test/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py b/test/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py index eaa12b49ee993f..5273b8c232a5b8 100644 --- a/test/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py +++ b/test/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py @@ -90,7 +90,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'is_test': self.is_test, - 'use_mkldnn': self.use_onednn, + 
'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, 'force_fp32_output': self.force_fp32_output, 'data_format': self.data_format, diff --git a/test/mkldnn/test_elementwise_add_bf16_mkldnn_op.py b/test/mkldnn/test_elementwise_add_bf16_mkldnn_op.py index b51d7e989c371a..c552d1215267c6 100644 --- a/test/mkldnn/test_elementwise_add_bf16_mkldnn_op.py +++ b/test/mkldnn/test_elementwise_add_bf16_mkldnn_op.py @@ -36,7 +36,7 @@ def setUp(self): self.y_bf16 = convert_float_to_uint16(self.y) self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': convert_float_to_uint16(self.out)} def generate_data(self): diff --git a/test/mkldnn/test_elementwise_div_mkldnn_op.py b/test/mkldnn/test_elementwise_div_mkldnn_op.py index 367c2b2b210e7b..f081f00e398a0e 100644 --- a/test/mkldnn/test_elementwise_div_mkldnn_op.py +++ b/test/mkldnn/test_elementwise_div_mkldnn_op.py @@ -37,7 +37,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def init_input_output(self): @@ -164,7 +164,7 @@ def setUp(self): self.x_bf16 = convert_float_to_uint16(self.x) self.y_bf16 = convert_float_to_uint16(self.y) self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': convert_float_to_uint16(self.out)} def init_dtype(self): diff --git a/test/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py b/test/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py index 8500c7dea868ba..b138c87f0cd477 100644 --- a/test/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py +++ b/test/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py @@ -35,7 +35,7 @@ def setUp(self): self.x_bf16 = convert_float_to_uint16(self.x) self.y_bf16 = convert_float_to_uint16(self.y) self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': convert_float_to_uint16(self.out)} def generate_data(self): diff --git a/test/mkldnn/test_elementwise_sub_onednn_op.py b/test/mkldnn/test_elementwise_sub_onednn_op.py index a9787c115109eb..51e30dd4d6bca4 100644 --- a/test/mkldnn/test_elementwise_sub_onednn_op.py +++ b/test/mkldnn/test_elementwise_sub_onednn_op.py @@ -44,7 +44,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def init_input_output(self): @@ -225,7 +225,7 @@ def setUp(self): self.x_bf16 = convert_float_to_uint16(self.x) self.y_bf16 = convert_float_to_uint16(self.y) self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': convert_float_to_uint16(self.out)} def init_dtype(self): diff --git a/test/mkldnn/test_expand_v2_mkldnn_op.py b/test/mkldnn/test_expand_v2_mkldnn_op.py index 8d30412e510dd0..3036069b50b010 100644 --- 
a/test/mkldnn/test_expand_v2_mkldnn_op.py +++ b/test/mkldnn/test_expand_v2_mkldnn_op.py @@ -30,7 +30,7 @@ def setUp(self): self.op_type = "expand_v2" self.init_data() self.x = np.random.random(self.ori_shape).astype("float32") - self.attrs = {'shape': self.shape, 'use_mkldnn': True} + self.attrs = {'shape': self.shape, 'use_onednn': True} self.set_inputs() self.set_additional_inputs() output = np.tile(self.x, self.expand_times) diff --git a/test/mkldnn/test_fc_bf16_mkldnn_op.py b/test/mkldnn/test_fc_bf16_mkldnn_op.py index 05c4d6775283fd..b04120c1e7e5a6 100644 --- a/test/mkldnn/test_fc_bf16_mkldnn_op.py +++ b/test/mkldnn/test_fc_bf16_mkldnn_op.py @@ -60,7 +60,7 @@ def setUp(self): } self.attrs = { - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'force_fp32_output': self.force_fp32_output, } diff --git a/test/mkldnn/test_fc_int8_mkldnn_op.py b/test/mkldnn/test_fc_int8_mkldnn_op.py index da14db39df48da..353978b12b23d4 100644 --- a/test/mkldnn/test_fc_int8_mkldnn_op.py +++ b/test/mkldnn/test_fc_int8_mkldnn_op.py @@ -33,7 +33,7 @@ def setUp(self): ) self.attrs = { - 'use_mkldnn': True, + 'use_onednn': True, 'Scale_in': self.x_scale, 'Scale_weights': [self.y_scale] * y_scales_size, 'Scale_out': self.out_scale, diff --git a/test/mkldnn/test_fc_mkldnn_op.py b/test/mkldnn/test_fc_mkldnn_op.py index 3372238db9d9d4..b625cb57db35b1 100644 --- a/test/mkldnn/test_fc_mkldnn_op.py +++ b/test/mkldnn/test_fc_mkldnn_op.py @@ -45,7 +45,7 @@ def setUp(self): 'Bias': self.bias, } - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} self.outputs = { 'Out': fully_connected_naive( diff --git a/test/mkldnn/test_fill_constant_mkldnn_op.py b/test/mkldnn/test_fill_constant_mkldnn_op.py index 562a0dd0ae503d..01d1feb83d06ea 100644 --- a/test/mkldnn/test_fill_constant_mkldnn_op.py +++ b/test/mkldnn/test_fill_constant_mkldnn_op.py @@ -57,7 +57,7 @@ def set_inputs(self): self.inputs = {} def set_attrs(self): - self.attrs = {'shape': (3, 5), 'use_mkldnn': True, 'value': self.value} + self.attrs = {'shape': (3, 5), 'use_onednn': True, 'value': self.value} def test_check_output(self): self.check_output(check_pir_onednn=True) @@ -87,7 +87,7 @@ def set_inputs(self): class TestFillZerosLike2DStringValueInfOneDNNOp(TestFillConstant2DOneDNNOp): def set_attrs(self): self.str_value = "inf" - self.attrs = {'shape': (10, 13), 'use_mkldnn': True, 'str_value': "inf"} + self.attrs = {'shape': (10, 13), 'use_onednn': True, 'str_value': "inf"} class TestFillZerosLike2DStringValueMinusInfOneDNNOp( @@ -97,7 +97,7 @@ def set_attrs(self): self.str_value = "-inf" self.attrs = { 'shape': (10, 13), - 'use_mkldnn': True, + 'use_onednn': True, 'str_value': "-inf", } @@ -107,7 +107,7 @@ def set_attrs(self): self.str_value = "0.123" self.attrs = { 'shape': (10, 13), - 'use_mkldnn': True, + 'use_onednn': True, 'str_value': "0.123", } diff --git a/test/mkldnn/test_flags_use_mkldnn.py b/test/mkldnn/test_flags_use_mkldnn.py index 54b2be715809c9..01d483f9f9e2fe 100644 --- a/test/mkldnn/test_flags_use_mkldnn.py +++ b/test/mkldnn/test_flags_use_mkldnn.py @@ -22,7 +22,7 @@ class TestFlagsUseOnednn(unittest.TestCase): def setUp(self): self._python_interp = sys.executable - self._python_interp += " check_flags_use_mkldnn.py" + self._python_interp += " check_flags_use_onednn.py" self.env = os.environ.copy() self.env["GLOG_v"] = "1" diff --git a/test/mkldnn/test_flatten_mkldnn_op.py b/test/mkldnn/test_flatten_mkldnn_op.py index 7bd90724082a17..2ba826e3ddc9ed 100644 --- 
a/test/mkldnn/test_flatten_mkldnn_op.py +++ b/test/mkldnn/test_flatten_mkldnn_op.py @@ -27,7 +27,7 @@ def setUp(self): self.set_op_type() self.init_test_case() self.set_inputs() - self.attrs = {"axis": self.axis, 'use_mkldnn': True} + self.attrs = {"axis": self.axis, 'use_onednn': True} self.ori_shape = self.inputs['X'].shape self.outputs = {"Out": self.inputs["X"].copy().reshape(self.new_shape)} diff --git a/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py b/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py index e51b67888f402a..6248a7fe7e102e 100644 --- a/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py +++ b/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py @@ -129,7 +129,7 @@ def setUp(self): 'is_reverse': self.is_reverse, 'origin_mode': self.origin_mode, 'force_fp32_output': self.force_fp32_output, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, } diff --git a/test/mkldnn/test_fusion_gru_int8_mkldnn_op.py b/test/mkldnn/test_fusion_gru_int8_mkldnn_op.py index 043a5eaa074030..e88fce1507f884 100644 --- a/test/mkldnn/test_fusion_gru_int8_mkldnn_op.py +++ b/test/mkldnn/test_fusion_gru_int8_mkldnn_op.py @@ -141,7 +141,7 @@ def setUp(self): 'gate_activation': self.act_gate, 'is_reverse': self.is_reverse, 'origin_mode': self.origin_mode, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, 'force_fp32_output': self.force_fp32_output, 'Scale_data': scale_data, diff --git a/test/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py b/test/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py index f87b674f59c6ae..bff4586e3d0c0e 100644 --- a/test/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py +++ b/test/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py @@ -145,7 +145,7 @@ def setUp(self): 'cell_activation': self.act_cell, 'candidate_activation': self.act_cand, 'force_fp32_output': self.force_fp32_output, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, } diff --git a/test/mkldnn/test_fusion_lstm_int8_mkldnn_op.py b/test/mkldnn/test_fusion_lstm_int8_mkldnn_op.py index 198bc2685cec49..c27e7b226fd283 100644 --- a/test/mkldnn/test_fusion_lstm_int8_mkldnn_op.py +++ b/test/mkldnn/test_fusion_lstm_int8_mkldnn_op.py @@ -130,7 +130,7 @@ def setUp(self): 'candidate_activation': self.act_cand, 'is_reverse': self.is_reverse, 'use_peepholes': self.use_peepholes, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, 'force_fp32_output': self.force_fp32_output, 'Scale_data': scale_data, diff --git a/test/mkldnn/test_gaussian_random_mkldnn_op.py b/test/mkldnn/test_gaussian_random_mkldnn_op.py index 84bcea864c306f..d45c678769a857 100644 --- a/test/mkldnn/test_gaussian_random_mkldnn_op.py +++ b/test/mkldnn/test_gaussian_random_mkldnn_op.py @@ -40,7 +40,7 @@ def setUp(self): "mean": 1.0, "std": 2.0, "seed": 10, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, } @@ -57,7 +57,7 @@ def setUp(self): "mean": self.mean, "std": self.std, "seed": 10, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, } paddle.seed(10) diff --git a/test/mkldnn/test_log_softmax_mkldnn_op.py b/test/mkldnn/test_log_softmax_mkldnn_op.py index 9f4807acb3fbc2..6d838bc86ff9c1 100644 --- a/test/mkldnn/test_log_softmax_mkldnn_op.py +++ b/test/mkldnn/test_log_softmax_mkldnn_op.py @@ -44,7 +44,7 @@ def setUp(self): self.inputs = {'X': x} self.outputs = {'Out': out} - self.attrs = {'axis': self.axis, 'use_mkldnn': True} + self.attrs = 
{'axis': self.axis, 'use_onednn': True} def set_dtype(self): self.dtype = np.float32 diff --git a/test/mkldnn/test_lrn_mkldnn_op.py b/test/mkldnn/test_lrn_mkldnn_op.py index 046bad391ee09b..874c73628d77a1 100644 --- a/test/mkldnn/test_lrn_mkldnn_op.py +++ b/test/mkldnn/test_lrn_mkldnn_op.py @@ -22,7 +22,7 @@ class TestLRNONEDNNOp(TestLRNOp): def get_attrs(self): attrs = TestLRNOp.get_attrs(self) - attrs['use_mkldnn'] = True + attrs['use_onednn'] = True return attrs def test_check_output(self): diff --git a/test/mkldnn/test_matmul_bf16_mkldnn_op.py b/test/mkldnn/test_matmul_bf16_mkldnn_op.py index 8f9e932620714e..78a943e73d889d 100644 --- a/test/mkldnn/test_matmul_bf16_mkldnn_op.py +++ b/test/mkldnn/test_matmul_bf16_mkldnn_op.py @@ -33,7 +33,7 @@ def generate_data(self): def set_attributes(self): self.attrs = { 'alpha': self.alpha, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "mkldnn_data_type": self.onednn_data_type, "force_fp32_output": self.force_fp32_output, 'transpose_X': False, @@ -146,7 +146,7 @@ def generate_data(self): def set_attributes(self): self.attrs = { - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "mkldnn_data_type": self.onednn_data_type, 'transpose_X': True, 'transpose_Y': False, @@ -161,7 +161,7 @@ def generate_data(self): def set_attributes(self): self.attrs = { - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "mkldnn_data_type": self.onednn_data_type, 'transpose_Y': True, 'transpose_X': False, diff --git a/test/mkldnn/test_matmul_v2_mkldnn_op.py b/test/mkldnn/test_matmul_v2_mkldnn_op.py index 836fa86c6d43d6..4c132ebef63bb1 100644 --- a/test/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/test/mkldnn/test_matmul_v2_mkldnn_op.py @@ -77,7 +77,7 @@ def setUp(self): self.attrs = { 'trans_x': self.trans_x, 'trans_y': self.trans_y, - 'use_mkldnn': True, + 'use_onednn': True, } self.set_dtype_attr() self.outputs = {'Out': result} diff --git a/test/mkldnn/test_mul_int8_mkldnn_op.py b/test/mkldnn/test_mul_int8_mkldnn_op.py index 71db940a027e0c..802a2e9d4aae73 100644 --- a/test/mkldnn/test_mul_int8_mkldnn_op.py +++ b/test/mkldnn/test_mul_int8_mkldnn_op.py @@ -35,7 +35,7 @@ def setUp(self): self.init_data_type() self.init_data() self.attrs = { - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "scale_x": self.scale_x, "scale_y": self.scale_y, "scale_out": self.scale_out, @@ -106,7 +106,7 @@ def setUp(self): self.init_data_type() self.init_data() self.attrs = { - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "scale_x": self.scale_x, "scale_y": self.scale_y, "scale_out": self.scale_out, diff --git a/test/mkldnn/test_mul_mkldnn_op.py b/test/mkldnn/test_mul_mkldnn_op.py index 9759a581dbb4cf..d528631246b779 100644 --- a/test/mkldnn/test_mul_mkldnn_op.py +++ b/test/mkldnn/test_mul_mkldnn_op.py @@ -25,7 +25,7 @@ class TestMulOneDNNOp(OpTest): def setUp(self): self.op_type = "mul" - self.attrs = {'use_mkldnn': True} + self.attrs = {'use_onednn': True} self.init_shapes_and_attrs() self.x_fp32 = np.random.random(self.x_shape).astype(np.float32) diff --git a/test/mkldnn/test_multi_gru_mkldnn_op.py b/test/mkldnn/test_multi_gru_mkldnn_op.py index f4d2b9cb9e60d9..ea6fc57bc94ae2 100644 --- a/test/mkldnn/test_multi_gru_mkldnn_op.py +++ b/test/mkldnn/test_multi_gru_mkldnn_op.py @@ -194,7 +194,7 @@ def setUp(self): 'gate_activation': 'sigmoid', 'layers': self.layers, 'origin_mode': self.origin_mode, - 'use_mkldnn': True, + 'use_onednn': True, } if is_int8: diff --git a/test/mkldnn/test_nearest_interp_v2_mkldnn_op.py 
b/test/mkldnn/test_nearest_interp_v2_mkldnn_op.py index e1ae1bcf3b7c6b..caf65abd9cc4ea 100644 --- a/test/mkldnn/test_nearest_interp_v2_mkldnn_op.py +++ b/test/mkldnn/test_nearest_interp_v2_mkldnn_op.py @@ -146,7 +146,7 @@ def setUp(self): 'out_w': self.out_w, 'scale': self.scale, 'data_layout': self.data_layout, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = {'Out': output_np} diff --git a/test/mkldnn/test_reduce_bf16_mkldnn_op.py b/test/mkldnn/test_reduce_bf16_mkldnn_op.py index 91606f6bf6329e..b8f0e497bbdaad 100644 --- a/test/mkldnn/test_reduce_bf16_mkldnn_op.py +++ b/test/mkldnn/test_reduce_bf16_mkldnn_op.py @@ -37,7 +37,7 @@ def setUp(self): self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} self.outputs = {'Out': self.x_fp32.sum(axis=0)} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} def test_check_output(self): self.check_output( @@ -100,7 +100,7 @@ def setUp(self): self.x_fp32 = np.random.normal(size=(2, 3, 5, 6)).astype('float32') self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': [0, 1, 2, 3]} + self.attrs = {'use_onednn': self.use_onednn, 'dim': [0, 1, 2, 3]} self.outputs = {'Out': self.x_fp32.sum(axis=tuple(self.attrs['dim']))} @@ -113,7 +113,7 @@ def setUp(self): self.x_fp32 = np.random.normal(size=(4, 7, 6, 6)).astype('float32') self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': [-1, -2, -3, -4]} + self.attrs = {'use_onednn': self.use_onednn, 'dim': [-1, -2, -3, -4]} self.outputs = {'Out': self.x_fp32.sum(axis=tuple(self.attrs['dim']))} @@ -126,7 +126,7 @@ def setUp(self): self.x_fp32 = np.random.normal(size=(2, 5, 3, 2, 5)).astype('float32') self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'reduce_all': True, 'keep_dim': True, 'use_mkldnn': True} + self.attrs = {'reduce_all': True, 'keep_dim': True, 'use_onednn': True} self.outputs = {'Out': self.x_fp32.sum(keepdims=self.attrs['keep_dim'])} @@ -139,7 +139,7 @@ def setUp(self): self.x_fp32 = np.random.normal(size=(4, 5, 4, 5)).astype('float32') self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'reduce_all': True, 'use_mkldnn': self.use_onednn} + self.attrs = {'reduce_all': True, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.x_fp32.sum()} @@ -156,7 +156,7 @@ def setUp(self): self.x_fp32 = np.random.random((5, 6, 10)).astype("float32") self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'dim': [-1], 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': [-1], 'use_onednn': self.use_onednn} self.outputs = {'Out': self.x_fp32.max(axis=tuple(self.attrs['dim']))} @@ -175,7 +175,7 @@ def setUp(self): self.x_fp32 = np.random.random((5, 6, 10, 9)).astype("float32") self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'dim': [-1, 0, 1], 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': [-1, 0, 1], 'use_onednn': self.use_onednn} self.outputs = {'Out': self.x_fp32.max(axis=tuple(self.attrs['dim']))} @@ -192,7 +192,7 @@ def setUp(self): self.x_fp32 = np.random.random((5, 6, 10)).astype("float32") self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'dim': [2], 'use_mkldnn': self.use_onednn} + 
self.attrs = {'dim': [2], 'use_onednn': self.use_onednn} self.outputs = {'Out': self.x_fp32.min(axis=tuple(self.attrs['dim']))} @@ -203,7 +203,7 @@ def setUp(self): self.x_fp32 = np.random.random((5, 6, 10)).astype("float32") self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} self.outputs = {'Out': self.x_fp32.sum(axis=0) / self.x_fp32.shape[0]} @@ -214,7 +214,7 @@ def setUp(self): self.x_fp32 = np.random.random((5, 6, 3, 5)).astype("float32") self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': [0, 1]} + self.attrs = {'use_onednn': self.use_onednn, 'dim': [0, 1]} self.outputs = { 'Out': self.x_fp32.sum(axis=tuple(self.attrs['dim'])) / (self.x_fp32.shape[0] * self.x_fp32.shape[1]) diff --git a/test/mkldnn/test_reshape_bf16_op.py b/test/mkldnn/test_reshape_bf16_op.py index 94978e67d81468..587e348644c66a 100644 --- a/test/mkldnn/test_reshape_bf16_op.py +++ b/test/mkldnn/test_reshape_bf16_op.py @@ -35,7 +35,7 @@ def setUp(self): self.inputs = {'X': self.input_data} self.attrs = { 'shape': self.new_shape, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, } self.outputs = { diff --git a/test/mkldnn/test_scale_bf16_mkldnn_op.py b/test/mkldnn/test_scale_bf16_mkldnn_op.py index 26943471b285dd..2ababf6f4441d4 100644 --- a/test/mkldnn/test_scale_bf16_mkldnn_op.py +++ b/test/mkldnn/test_scale_bf16_mkldnn_op.py @@ -35,7 +35,7 @@ def setUp(self): self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.scale = -2.3 self.inputs = {'X': self.x_bf16} - self.attrs = {'scale': self.scale, 'use_mkldnn': True, 'bias': 0.4} + self.attrs = {'scale': self.scale, 'use_onednn': True, 'bias': 0.4} self.use_onednn = True self.outputs = { 'Out': (self.x_fp32 * self.attrs['scale']) + self.attrs['bias'] @@ -78,7 +78,7 @@ def setUp(self): self.inputs = {'X': self.x_bf16} self.attrs = { 'scale': self.scale, - 'use_mkldnn': True, + 'use_onednn': True, 'bias': 0.0, 'bias_after_scale': False, } @@ -99,7 +99,7 @@ def setUp(self): 'X': self.x_bf16, 'ScaleTensor': convert_float_to_uint16(self.scale_tensor), } - self.attrs = {'use_mkldnn': True} + self.attrs = {'use_onednn': True} self.outputs = {'Out': self.x_fp32 * self.scale} @@ -117,7 +117,7 @@ def setUp(self): self.attrs = { 'bias': -1.1, 'bias_after_scale': False, - 'use_mkldnn': True, + 'use_onednn': True, } self.outputs = {'Out': (self.x_fp32 + self.attrs['bias']) * self.scale} diff --git a/test/mkldnn/test_shuffle_channel_mkldnn_op.py b/test/mkldnn/test_shuffle_channel_mkldnn_op.py index e9510c96369617..36e10885a6c707 100644 --- a/test/mkldnn/test_shuffle_channel_mkldnn_op.py +++ b/test/mkldnn/test_shuffle_channel_mkldnn_op.py @@ -28,7 +28,7 @@ def setUp(self): self.set_dtype() self.set_group() self.inputs = {'X': np.random.random((5, 64, 2, 3)).astype(self.dtype)} - self.attrs = {'use_mkldnn': True, 'group': self.group} + self.attrs = {'use_onednn': True, 'group': self.group} _, c, h, w = self.inputs['X'].shape input_reshaped = np.reshape( diff --git a/test/mkldnn/test_slice_mkldnn_op.py b/test/mkldnn/test_slice_mkldnn_op.py index 1a71278a9f2167..e95b9626add571 100644 --- a/test/mkldnn/test_slice_mkldnn_op.py +++ b/test/mkldnn/test_slice_mkldnn_op.py @@ -36,7 +36,7 @@ def setUp(self): 'starts': self.starts, 'ends': self.ends, 'infer_flags': self.infer_flags, - 'use_mkldnn': True, + 'use_onednn': True, } 
self.set_attrs() diff --git a/test/mkldnn/test_softmax_bf16_mkldnn_op.py b/test/mkldnn/test_softmax_bf16_mkldnn_op.py index b52dda9aa724ce..31b16cb38e0079 100644 --- a/test/mkldnn/test_softmax_bf16_mkldnn_op.py +++ b/test/mkldnn/test_softmax_bf16_mkldnn_op.py @@ -64,7 +64,7 @@ def setUp(self): self.inputs = {'X': convert_float_to_uint16(x)} self.outputs = {'Out': out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def test_check_output(self): self.check_output_with_place(core.CPUPlace(), check_pir_onednn=True) diff --git a/test/mkldnn/test_softplus_mkldnn_op.py b/test/mkldnn/test_softplus_mkldnn_op.py index 0949b63cc2c59d..5903a9faf32193 100644 --- a/test/mkldnn/test_softplus_mkldnn_op.py +++ b/test/mkldnn/test_softplus_mkldnn_op.py @@ -37,7 +37,7 @@ def setUp(self): self.threshold = 20 self.config() self.set_dtype() - self.attrs = {'use_mkldnn': True, 'beta': self.beta} + self.attrs = {'use_onednn': True, 'beta': self.beta} self.x = np.random.random(self.x_shape) self.out = ref_softplus(self.x, self.beta, self.threshold) diff --git a/test/mkldnn/test_split_bf16_mkldnn_op.py b/test/mkldnn/test_split_bf16_mkldnn_op.py index ae8edba09fc74d..3234941a8ed553 100644 --- a/test/mkldnn/test_split_bf16_mkldnn_op.py +++ b/test/mkldnn/test_split_bf16_mkldnn_op.py @@ -45,7 +45,7 @@ def setUp(self): self.init_data() self.inputs = {'X': self.x} self.attrs = { - 'use_mkldnn': True, + 'use_onednn': True, 'num': self.num, 'mkldnn_data_type': "bfloat16", } diff --git a/test/mkldnn/test_squeeze2_mkldnn_op.py b/test/mkldnn/test_squeeze2_mkldnn_op.py index fc0f731f35b681..9e2a4bb774b99f 100644 --- a/test/mkldnn/test_squeeze2_mkldnn_op.py +++ b/test/mkldnn/test_squeeze2_mkldnn_op.py @@ -38,7 +38,7 @@ def set_inputs(self): self.inputs = {"X": self.x} def init_attrs(self): - self.attrs = {"axes": self.axes, 'use_mkldnn': True} + self.attrs = {"axes": self.axes, 'use_onednn': True} def set_outputs(self): self.outputs = { diff --git a/test/mkldnn/test_stack_mkldnn_op.py b/test/mkldnn/test_stack_mkldnn_op.py index 8b91c246d6e6b0..2bd48e74a377e1 100644 --- a/test/mkldnn/test_stack_mkldnn_op.py +++ b/test/mkldnn/test_stack_mkldnn_op.py @@ -56,7 +56,7 @@ def setUp(self): self.inputs = {'X': input_list} self.outputs = {'Y': np.stack(self.op_inputs, axis=self.axis)} - self.attrs = {'axis': self.axis, 'use_mkldnn': True} + self.attrs = {'axis': self.axis, 'use_onednn': True} def test_check_output(self): self.check_output_with_place(core.CPUPlace(), check_pir_onednn=True) diff --git a/test/mkldnn/test_sum_bf16_mkldnn_op.py b/test/mkldnn/test_sum_bf16_mkldnn_op.py index 341a17416df3e4..9bc17c6c168fa3 100644 --- a/test/mkldnn/test_sum_bf16_mkldnn_op.py +++ b/test/mkldnn/test_sum_bf16_mkldnn_op.py @@ -45,7 +45,7 @@ def setUp(self): y = x0 + x1 + x2 self.outputs = {'Out': convert_float_to_uint16(y)} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} def test_check_output(self): self.check_output_with_place(core.CPUPlace(), check_pir_onednn=True) diff --git a/test/mkldnn/test_transpose_bf16_mkldnn_op.py b/test/mkldnn/test_transpose_bf16_mkldnn_op.py index 8f0d5e9a952143..89c597a6d0de25 100644 --- a/test/mkldnn/test_transpose_bf16_mkldnn_op.py +++ b/test/mkldnn/test_transpose_bf16_mkldnn_op.py @@ -37,7 +37,7 @@ def setUp(self): self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, } diff --git 
a/test/mkldnn/test_transpose_int8_mkldnn_op.py b/test/mkldnn/test_transpose_int8_mkldnn_op.py index eefdc3dae12fb4..65205a9511c42f 100644 --- a/test/mkldnn/test_transpose_int8_mkldnn_op.py +++ b/test/mkldnn/test_transpose_int8_mkldnn_op.py @@ -36,7 +36,7 @@ def setUp(self): self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { diff --git a/test/mkldnn/test_transpose_mkldnn_op.py b/test/mkldnn/test_transpose_mkldnn_op.py index 125128a73b131d..f4a4bdaf173d9b 100644 --- a/test/mkldnn/test_transpose_mkldnn_op.py +++ b/test/mkldnn/test_transpose_mkldnn_op.py @@ -25,7 +25,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype("float32")} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': np.random.random(self.shape).astype("float32"), diff --git a/test/quantization/CMakeLists.txt b/test/quantization/CMakeLists.txt index 20082befcba268..c2f533b9b31d8c 100644 --- a/test/quantization/CMakeLists.txt +++ b/test/quantization/CMakeLists.txt @@ -6,13 +6,13 @@ file( string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") function(_inference_analysis_python_api_int8_test target model_dir data_path - filename use_mkldnn) + filename use_onednn) py_test( ${target} SRCS ${filename} ENVS CPU_NUM_THREADS=${CPU_NUM_THREADS_ON_CI} - FLAGS_use_onednn=${use_mkldnn} + FLAGS_use_onednn=${use_onednn} ARGS --infer_model ${model_dir}/model diff --git a/test/quantization/README.md b/test/quantization/README.md index eeb4b838fe7648..3137a49be0e5d3 100644 --- a/test/quantization/README.md +++ b/test/quantization/README.md @@ -264,7 +264,7 @@ The following options are also accepted: ```bash cd /PATH/TO/PADDLE -OMP_NUM_THREADS=28 FLAGS_use_mkldnn=true python python/paddle/static/quantization/slim/tests/quant2_int8_image_classification_comparison.py --quant_model=/PATH/TO/DOWNLOADED/QUANT/MODEL --fp32_model=/PATH/TO/DOWNLOADED/FP32/MODEL --infer_data=$HOME/.cache/paddle/dataset/int8/download/int8_full_val.bin --batch_size=50 --batch_num=1000 --acc_diff_threshold=0.01 --ops_to_quantize="conv2d,pool2d" +OMP_NUM_THREADS=28 FLAGS_use_onednn=true python python/paddle/static/quantization/slim/tests/quant2_int8_image_classification_comparison.py --quant_model=/PATH/TO/DOWNLOADED/QUANT/MODEL --fp32_model=/PATH/TO/DOWNLOADED/FP32/MODEL --infer_data=$HOME/.cache/paddle/dataset/int8/download/int8_full_val.bin --batch_size=50 --batch_num=1000 --acc_diff_threshold=0.01 --ops_to_quantize="conv2d,pool2d" ``` > Notes: Due to the large number of images in the `int8_full_val.bin` dataset (50 000), the accuracy benchmark may take a long time. To speed up the accuracy measurement, it is recommended to set `OMP_NUM_THREADS` to the maximum number of physical cores available on the server.
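The benchmark command above enables oneDNN through the renamed environment flag. The same switch can also be flipped programmatically; a small sketch, assuming `FLAGS_use_onednn` is registered exactly as this PR's rename of `FLAGS_use_mkldnn` implies:

```python
import paddle

# Toggle oneDNN kernels at runtime via the renamed flag, then read it back.
paddle.set_flags({'FLAGS_use_onednn': True})
print(paddle.get_flags(['FLAGS_use_onednn']))
```
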
diff --git a/test/xpu/op_test_xpu.py b/test/xpu/op_test_xpu.py index 956506bd47e1c0..875280639bcec7 100644 --- a/test/xpu/op_test_xpu.py +++ b/test/xpu/op_test_xpu.py @@ -292,8 +292,8 @@ def get_grad_with_place( # oneDNN numeric gradient should use CPU kernel use_onednn = False - if op_attrs.get("use_mkldnn"): - op_attrs["use_mkldnn"] = False + if op_attrs.get("use_onednn"): + op_attrs["use_onednn"] = False use_onednn = True mean_grad_op_types = get_xpu_op_support_types('mean') @@ -311,7 +311,7 @@ def get_grad_with_place( ) if use_onednn: - op_attrs["use_mkldnn"] = True + op_attrs["use_onednn"] = True if no_grad_set is None: no_grad_set = set() diff --git a/test/xpu/test_batch_norm_op_xpu.py b/test/xpu/test_batch_norm_op_xpu.py index 97ab78297934dd..6bbc3efe16c7f2 100644 --- a/test/xpu/test_batch_norm_op_xpu.py +++ b/test/xpu/test_batch_norm_op_xpu.py @@ -448,7 +448,7 @@ def test_train(self): "epsilon": self.epsilon, "is_test": False, "data_layout": self.data_layout, - "use_mkldnn": False, + "use_onednn": False, "fuse_with_relu": False, "use_global_stats": self.use_global_stats, } diff --git a/test/xpu/test_conv2d_op_xpu.py b/test/xpu/test_conv2d_op_xpu.py index e93f5b89e35d0e..16b80018905c3e 100644 --- a/test/xpu/test_conv2d_op_xpu.py +++ b/test/xpu/test_conv2d_op_xpu.py @@ -241,7 +241,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'fuse_relu_before_depthwise_conv': self.fuse_relu_before_depthwise_conv, 'exhaustive_search': self.exhaustive_search, @@ -402,7 +402,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'fuse_relu_before_depthwise_conv': self.fuse_relu_before_depthwise_conv, 'exhaustive_search': self.exhaustive_search, diff --git a/test/xpu/test_conv2d_transpose_op_xpu.py b/test/xpu/test_conv2d_transpose_op_xpu.py index 487fa004c105c9..8d7070a6697c5e 100644 --- a/test/xpu/test_conv2d_transpose_op_xpu.py +++ b/test/xpu/test_conv2d_transpose_op_xpu.py @@ -168,7 +168,7 @@ def setUp(self): 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, 'is_test': self.is_test, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } if self.output_size is not None: diff --git a/test/xpu/test_conv3d_op_xpu.py b/test/xpu/test_conv3d_op_xpu.py index b198370a87767a..6a96930339129a 100644 --- a/test/xpu/test_conv3d_op_xpu.py +++ b/test/xpu/test_conv3d_op_xpu.py @@ -244,7 +244,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } self.outputs = {'Output': output} @@ -419,7 +419,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } self.outputs = {'Output': output} diff --git a/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py index 578cd3b9c88d85..7c59ded26f6792 100644 --- a/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py +++ b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py @@ -168,7 +168,7 @@ def setUp(self): 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, 'is_test': self.is_test, - 
'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } if self.output_size is not None: diff --git a/test/xpu/test_elementwise_add_op_xpu.py b/test/xpu/test_elementwise_add_op_xpu.py index ee0c70d75b3341..7f8fc159b1d588 100644 --- a/test/xpu/test_elementwise_add_op_xpu.py +++ b/test/xpu/test_elementwise_add_op_xpu.py @@ -49,7 +49,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def test_check_output(self): diff --git a/test/xpu/test_elementwise_add_op_xpu_kp.py b/test/xpu/test_elementwise_add_op_xpu_kp.py index 857e8d72b188cc..d3ef8e332c06e0 100644 --- a/test/xpu/test_elementwise_add_op_xpu_kp.py +++ b/test/xpu/test_elementwise_add_op_xpu_kp.py @@ -39,7 +39,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def test_check_output(self): diff --git a/test/xpu/test_elementwise_floordiv_op_xpu.py b/test/xpu/test_elementwise_floordiv_op_xpu.py index f5e1a0ecc8356a..a4795874874a21 100644 --- a/test/xpu/test_elementwise_floordiv_op_xpu.py +++ b/test/xpu/test_elementwise_floordiv_op_xpu.py @@ -50,7 +50,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def test_check_output(self): diff --git a/test/xpu/test_elementwise_mod_op_xpu.py b/test/xpu/test_elementwise_mod_op_xpu.py index 035595d2e36e84..b3d212ada318c2 100644 --- a/test/xpu/test_elementwise_mod_op_xpu.py +++ b/test/xpu/test_elementwise_mod_op_xpu.py @@ -48,7 +48,7 @@ def init_input_output(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def init_dtype(self): pass @@ -81,7 +81,7 @@ def init_input_output(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} class TestRemainderOp(unittest.TestCase): def test_dygraph(self): diff --git a/test/xpu/test_elementwise_mul_op_xpu.py b/test/xpu/test_elementwise_mul_op_xpu.py index c50de0285d66c1..c7116ea8f42905 100644 --- a/test/xpu/test_elementwise_mul_op_xpu.py +++ b/test/xpu/test_elementwise_mul_op_xpu.py @@ -126,7 +126,7 @@ def init_input_output(self): 'Y': self.y, } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def init_axis(self): pass @@ -281,7 +281,7 @@ def init_input_output(self): 'Y': self.y, } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def gen_output(self): if self.cal_x is None: diff --git a/test/xpu/test_gaussian_random_op_xpu.py b/test/xpu/test_gaussian_random_op_xpu.py index f457e0056da782..d2bec51113d8fe 100644 
--- a/test/xpu/test_gaussian_random_op_xpu.py +++ b/test/xpu/test_gaussian_random_op_xpu.py @@ -66,7 +66,7 @@ def setUp(self): "mean": self.mean, "std": self.std, "seed": 10, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "dtype": typeid_dict[self.in_type_str], } paddle.seed(10) @@ -119,7 +119,7 @@ def setUp(self): 'mean': self.mean, 'std': self.std, 'seed': self.seed, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, "dtype": typeid_dict[self.in_type_str], } @@ -185,7 +185,7 @@ def setUp(self): 'mean': self.mean, 'std': self.std, 'seed': self.seed, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, "dtype": typeid_dict[self.in_type_str], } self.outputs = {'Out': np.zeros((123, 92), dtype=self.dtype)} diff --git a/test/xpu/test_pool2d_op_xpu.py b/test/xpu/test_pool2d_op_xpu.py index 1aab84bc6f11b6..a5cc545e7e7d22 100644 --- a/test/xpu/test_pool2d_op_xpu.py +++ b/test/xpu/test_pool2d_op_xpu.py @@ -331,7 +331,7 @@ def setUp(self): 'pooling_type': self.pool_type, 'global_pooling': self.global_pool, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'exclusive': self.exclusive, 'adaptive': self.adaptive, diff --git a/test/xpu/test_transpose_op_xpu.py b/test/xpu/test_transpose_op_xpu.py index 8188984165969e..c46b7174b5def1 100644 --- a/test/xpu/test_transpose_op_xpu.py +++ b/test/xpu/test_transpose_op_xpu.py @@ -40,7 +40,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': False, + 'use_onednn': False, 'use_xpu': True, } self.outputs = { @@ -156,7 +156,7 @@ def setUp(self): } self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': False, + 'use_onednn': False, 'use_xpu': True, } self.outputs = { diff --git a/tools/xpu/pack_paddle_dependence.sh b/tools/xpu/pack_paddle_dependence.sh index 0cbc258b0f7610..3996d5cd76b067 100644 --- a/tools/xpu/pack_paddle_dependence.sh +++ b/tools/xpu/pack_paddle_dependence.sh @@ -94,10 +94,8 @@ function xhpc_prepare() { cp -r ${XHPC_DIR_NAME}/xpudnn/so/libxpu_dnn.so xpu/lib/ if [[ "${WITH_MKL}" == "ON" ]]; then - cp -r ${BUILD_DIR}/third_party/install/mklml/lib/libiomp5.so xpu/lib/ - pushd xpu/lib - ln -sf libiomp5.so libomp.so - popd + # xpu/lib/libomp.so may be a dangling symlink at pack time (libiomp5.so is not built yet); by the time libomp.so is actually needed, the target exists and the link resolves. + ln -sf ${BUILD_DIR}/third_party/install/mklml/lib/libiomp5.so xpu/lib/libomp.so else cp -r ${XHPC_DIR_NAME}/xpudnn/so/libomp.so xpu/lib/ pushd xpu/lib @@ -160,10 +158,8 @@ function local_assemble() { cp -r ${LOCAL_PATH}/${XHPC_DIR_NAME}/xpudnn/so/libxpu_dnn.so xpu/lib/ if [[ "${WITH_MKL}" == "ON" ]]; then - cp -r ${BUILD_DIR}/third_party/install/mklml/lib/libiomp5.so xpu/lib/ - pushd xpu/lib - ln -sf libiomp5.so libomp.so - popd + # xpu/lib/libomp.so may be a dangling symlink at pack time (libiomp5.so is not built yet); by the time libomp.so is actually needed, the target exists and the link resolves. + ln -sf ${BUILD_DIR}/third_party/install/mklml/lib/libiomp5.so xpu/lib/libomp.so else cp -r ${XHPC_DIR_NAME}/xpudnn/so/libomp.so xpu/lib/ pushd xpu/lib
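Since the repacked layout now leaves xpu/lib/libomp.so pointing at a file that may not exist yet, a quick sanity check for whether the link has become valid (a sketch only; the path is the one the script above creates):

```python
import os

# A dangling symlink reports islink() == True but exists() == False;
# once mklml's libiomp5.so is built, exists() flips to True.
link = 'xpu/lib/libomp.so'
print(os.path.islink(link), os.path.exists(link), os.path.realpath(link))
```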