diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 2e0d021f102a55..bb60cca94f3d76 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -253,9 +253,9 @@ if(WITH_XPU_XRE5) DOWNLOAD_COMMAND bash ${CMAKE_SOURCE_DIR}/tools/xpu/pack_paddle_dependence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XHPC_URL} ${XPU_XHPC_DIR_NAME} - ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} 1 ${WITH_MKL} - "${CMAKE_SOURCE_DIR}/build" && wget ${XPU_XFT_GET_DEPENCE_URL} && bash - ${XFT_COMMAND} ${XPU_XFT_URL} ${XPU_XFT_DIR_NAME} && bash + ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} 1 ${WITH_MKL} "${CMAKE_BINARY_DIR}" + && wget ${XPU_XFT_GET_DEPENCE_URL} && bash ${XFT_COMMAND} ${XPU_XFT_URL} + ${XPU_XFT_DIR_NAME} && bash ${CMAKE_SOURCE_DIR}/tools/xpu/get_xpti_dependence.sh ${XPU_XPTI_URL} ${XPU_XPTI_DIR_NAME} && bash ${CMAKE_SOURCE_DIR}/tools/xpu/get_xpufft_dependence.sh ${XPU_FFT_URL} diff --git a/paddle/cinn/common/integer_set.cc b/paddle/cinn/common/integer_set.cc index a4fa9ecbae1afe..efe54a192a7d2a 100644 --- a/paddle/cinn/common/integer_set.cc +++ b/paddle/cinn/common/integer_set.cc @@ -164,7 +164,8 @@ cas_intervals_t CollectVarIntervalsOfExprs(const std::vector<Expr>& exprs, lower_bound = ir::Expr(1); } var_intervals.insert( - {var->name, CasInterval(lower_bound, upper_bound)}); + {var->name, + CasInterval(lower_bound, NormalizeUpperBound(upper_bound))}); } return false; }); @@ -572,6 +573,9 @@ class BoundReplacer : public ir::IRMutator<> { ir::Expr SymbolicExprAnalyzer::LowerBound(const ir::Expr& expr) const { BoundReplacer bound_replacer(var_intervals_, true); ir::Expr bound = ir::ir_utils::IRCopy(expr); + if (bound.is_index()) { + bound = bound.as_index().Normalize(ir::IndexExpr::OptLevel::kLevel3); + } bound_replacer(&bound); return optim::ArithSimplify(bound); } @@ -579,7 +583,11 @@ ir::Expr SymbolicExprAnalyzer::LowerBound(const ir::Expr& expr) const { ir::Expr SymbolicExprAnalyzer::UpperBound(const ir::Expr& expr) const { BoundReplacer bound_replacer(var_intervals_, false); ir::Expr bound = ir::ir_utils::IRCopy(expr); + if (bound.is_index()) { + bound = bound.as_index().Normalize(ir::IndexExpr::OptLevel::kLevel3); + } bound_replacer(&bound); + return optim::ArithSimplify(bound); } @@ -709,7 +717,8 @@ SingleIntervalIntSet::SingleIntervalIntSet(const ir::Expr& min, ?
x->as_var()->upper_bound : SymbolicExprLimit::positive_inf; var_intervals_.insert( - {x->as_var()->name, CasInterval(lower_bound, upper_bound)}); + {x->as_var()->name, + CasInterval(lower_bound, NormalizeUpperBound(upper_bound))}); } return false; }; diff --git a/paddle/cinn/common/ir_util.cc b/paddle/cinn/common/ir_util.cc index 32f967f09d7759..710d6a2c85f2a1 100644 --- a/paddle/cinn/common/ir_util.cc +++ b/paddle/cinn/common/ir_util.cc @@ -270,6 +270,16 @@ bool is_zero(Expr v) { return false; } +Expr NormalizeUpperBound(Expr upper_bound, bool minus_one /* = true */) { + if (upper_bound == SymbolicExprLimit::positive_inf) { + return upper_bound; + } + if (minus_one) { + return upper_bound - ir::Expr(1); // [lower, upper) to [lower, upper] + } + return upper_bound + ir::Expr(1); // [lower, upper] to [lower, upper) +} + Expr CastIfNeeded(Expr body, Type type) { if (body.type() == type) return body; return ir::Cast::Make(type, body); diff --git a/paddle/cinn/common/ir_util.h b/paddle/cinn/common/ir_util.h index bbc81c2b64e5d3..d4486a052b9e70 100644 --- a/paddle/cinn/common/ir_util.h +++ b/paddle/cinn/common/ir_util.h @@ -91,6 +91,8 @@ std::vector GatherItersToTensorProducer( bool is_zero(Expr v); +Expr NormalizeUpperBound(Expr upper_bound, bool minus_one = true); + bool MathEqual(const Expr &a, const Expr &b); //! helper function to get a ir::Select node.
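The NormalizeUpperBound helper above bridges two bound conventions: a Var's upper_bound is exclusive (i ∈ [lower, upper)), while CasInterval after this change holds a closed interval [lower, upper]. A minimal standalone sketch of the conversion arithmetic, with plain ints standing in for ir::Expr (illustrative names only, not the CINN API):

```cpp
#include <cassert>

// Var upper_bound is exclusive; CasInterval's upper bound is inclusive.
int ToInclusiveUpper(int exclusive_upper) { return exclusive_upper - 1; }
int ToExclusiveUpper(int inclusive_upper) { return inclusive_upper + 1; }

int main() {
  // A loop `for (i = 0; i < 8; ++i)` gives i in [0, 8).
  int extent = 8;
  int inclusive = ToInclusiveUpper(extent);
  assert(inclusive == 7);                         // largest value i can take
  assert(ToExclusiveUpper(inclusive) == extent);  // round-trips
  return 0;
}
```

This is also why BoundSimplify (further down) now uses ProveLT where it previously used ProveLE: with an inclusive upper bound u, proving `numerator / d == 0` needs u < d rather than u <= d.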
diff --git a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc index 6cbdfef7b11333..d59d77954934ce 100644 --- a/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc +++ b/paddle/cinn/ir/group_schedule/dy_shape_group_scheduler.cc @@ -136,7 +136,7 @@ std::vector<std::pair<ir::Expr, ir::Expr>> DynamicShapeGroupScheduler::GetCX86IRs() { std::vector<std::pair<ir::Expr, ir::Expr>> irs(1); irs[0].first = ir::EQ::Make(ir::Expr(1), ir::Expr(1)); - irs[1].second = ir_sch_->GetModule().GetExprs()[0]; + irs[0].second = ir_sch_->GetModule().GetExprs()[0]; return irs; } diff --git a/paddle/cinn/ir/group_schedule/tactic/arrange_storage_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/arrange_storage_tactic.cc index 333846d6740568..2327d2f3aeeddd 100644 --- a/paddle/cinn/ir/group_schedule/tactic/arrange_storage_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/arrange_storage_tactic.cc @@ -141,7 +141,8 @@ IntSet Evaluate(Expr expr, const std::unordered_map& var_domain) { Expr copy_for_upper_bound = ir::ir_utils::IRCopy(expr); Expr copy_for_lower_bound = ir::ir_utils::IRCopy(expr); - common::cas_intervals_t var_intervals; + common::cas_intervals_t + var_intervals; // variable name -> CasInterval [lower_bound, upper_bound] std::vector<ir::Expr> var_vec = ir::ir_utils::CollectIRNodesWithoutTensor( expr, [](const ir::Expr* x) { return x->as_var(); }); for (Expr var_expr : var_vec) { @@ -150,7 +151,9 @@ IntSet Evaluate(Expr expr, const ir::Var& fixed_var = fixed.at(var); var_intervals.emplace( fixed_var->name, - common::CasInterval(fixed_var->lower_bound, fixed_var->upper_bound)); + common::CasInterval( + fixed_var->lower_bound, + cinn::common::NormalizeUpperBound(fixed_var->upper_bound))); optim::ReplaceVarWithExpr(&copy_for_lower_bound, var, Expr(fixed_var)); optim::ReplaceVarWithExpr(&copy_for_upper_bound, var, Expr(fixed_var)); } else if (var_domain.count(var) != 0) { @@ -172,7 +175,8 @@ IntSet Evaluate(Expr expr, ::common::errors::InvalidArgument( "The 'upper_bound' of the variable must be defined.")); optim::ReplaceVarWithExpr(&copy_for_lower_bound, var, var->lower_bound); - optim::ReplaceVarWithExpr(&copy_for_upper_bound, var, var->upper_bound); + optim::ReplaceVarWithExpr( + &copy_for_upper_bound, var, NormalizeUpperBound(var->upper_bound)); } } ir::Expr lower_bound = optim::ArithSimplify(copy_for_lower_bound); diff --git a/paddle/cinn/ir/ir.h b/paddle/cinn/ir/ir.h index 7acf4e110cde2d..cf8b58cd6b57f7 100644 --- a/paddle/cinn/ir/ir.h +++ b/paddle/cinn/ir/ir.h @@ -421,6 +421,7 @@ struct _Var_ : public ExprNode<_Var_> { }; //! A named variable. +// i ∈ [lower_bound, upper_bound) struct Var : public IrNodeRef { Var() = default; explicit Var(IrNode* n) : IrNodeRef(n) {} @@ -846,6 +847,7 @@ struct For : public ExprNode<For>, public ForBase { //! The minimum value of the iteration. Expr min; //! The extent of the iteration. + // loop_var ∈ [min, min + extent) Expr extent; Expr body; diff --git a/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc b/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc index 492738516e95a7..860d285b242aa6 100644 --- a/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc +++ b/paddle/cinn/ir/ir_analyzer/ir_analyzer.cc @@ -621,7 +621,8 @@ std::vector<ir::Var> IndicesToVars(const std::vector<ir::Expr>& indices) { if (e.is_constant()) { std::string var_name = cinn::UniqName("constant" + static_cast<int>(e.get_constant())); - result.emplace_back(e, e, var_name, /* is_reduce = */ false); + result.emplace_back( + e, NormalizeUpperBound(e, false), var_name, /* is_reduce = */ false); } else if (e.As<ir::_Var_>() != nullptr) { ir::Expr copy_e = ir::ir_utils::IRCopy(e); ir::_Var_* var_ref = copy_e.As<ir::_Var_>(); @@ -635,14 +636,17 @@ std::vector<ir::Var> IndicesToVars(const std::vector<ir::Expr>& indices) { ir::Var var = x->as_var_ref(); var_intervals.insert( {var->name, - common::CasInterval{var->lower_bound, var->upper_bound}}); + common::CasInterval{var->lower_bound, + NormalizeUpperBound(var->upper_bound)}}); if (var->is_reduce_axis) is_reduce = true; } return false; }); common::SymbolicExprAnalyzer analyzer(var_intervals); - result.emplace_back( - analyzer.LowerBound(e), analyzer.UpperBound(e), var_name, is_reduce); + result.emplace_back(analyzer.LowerBound(e), + NormalizeUpperBound(analyzer.UpperBound(e), false), + var_name, + is_reduce); } } return result; diff --git a/paddle/cinn/optim/ir_simplify.cc b/paddle/cinn/optim/ir_simplify.cc index f1b5d3dfc9f381..1457b61528976a 100644 --- a/paddle/cinn/optim/ir_simplify.cc +++ b/paddle/cinn/optim/ir_simplify.cc @@ -386,6 +386,296 @@ struct SimplifySelectMutator : public ir::IRMutator<> { } }; +/* +Example 1: + Select(a <= b, b, a) → max(a, b) +Example 2: + Select(a <= b, a, b) → min(a, b) +Example 3: + Select(a <= MAX, max(a, MIN), MAX) → min(max(a, MIN), MAX) + Select(a <= MAX, max(MIN, a), MAX) → min(max(a, MIN), MAX) +Example 4: + Select(MIN <= b, min(b, MAX), MIN) → max(min(b, MAX), MIN) + → min(max(b, MIN), MAX) + Select(MIN <= b, min(MAX, b), MIN) → max(min(b, MAX), MIN) + → min(max(b, MIN), MAX) +*/ +struct SimplifySelect2MinMaxMutator : public ir::ExprMutator<> { + void operator()(Expr* x) { ir::ExprMutator<>::Visit(x, x); } + + using ir::ExprMutator<>::Visit; + + // Recursively optimize CompareOp operands + template <typename T> + void VisitCompare(T* op, Expr* expr) { + Expr a = op->a(); + Expr b = op->b(); + ir::ExprMutator<>::Visit(&a, &a); + ir::ExprMutator<>::Visit(&b, &b); + + if (a.get() != op->a().get() || b.get() != op->b().get()) { + *expr = T::Make(a, b); + } + } + + void Visit(const ir::GE* op, Expr* expr) override { VisitCompare(op, expr); } + void Visit(const ir::GT* op, Expr* expr) override { VisitCompare(op, expr); } + void Visit(const ir::LE* op, Expr* expr) override { VisitCompare(op, expr); } + void Visit(const ir::LT* op, Expr* expr) override {
VisitCompare(op, expr); } + + void Visit(const Select* op, Expr* expr) override { + auto* node = expr->As<ir::Select>(); + + // 1. Recursively optimize sub-expressions + Expr condition = node->condition; + Expr true_value = node->true_value; + Expr false_value = node->false_value; + + ir::ExprMutator<>::Visit(&condition, &condition); + ir::ExprMutator<>::Visit(&true_value, &true_value); + ir::ExprMutator<>::Visit(&false_value, &false_value); + + // 2. If sub-expressions are modified, rebuild the Select node + if (condition.get() != node->condition.get() || + true_value.get() != node->true_value.get() || + false_value.get() != node->false_value.get()) { + *expr = ir::Select::Make(condition, true_value, false_value); + node = expr->As<ir::Select>(); + } + + // 3. Function to optimize Select into Min/Max when possible + auto TryOptimizeSelect = [&](const Expr& a, + const Expr& b, + const Expr& x, + const Expr& y) -> Expr { + // Case 1: Select(a <= b, b, a) → max(a, b) + if (x == b && y == a) { + if (b.is_constant()) { + return ir::Max::Make(a, b); + } else { + return ir::Max::Make(b, a); + } + } + // Case 2: Select(a <= b, a, b) → min(a, b) + if (x == a && y == b) { + if (b.is_constant()) { + return ir::Min::Make(a, b); + } else { + return ir::Min::Make(b, a); + } + } + // Case 3: Select(a <= MAX, max(a, MIN), MAX) → min(max(a, MIN), MAX) + if (auto* max = x.As<ir::Max>()) { + if (max->a() == a) { + if (max->b().is_constant() && y.is_constant() && b.is_constant()) { + if (y.get_constant() == b.get_constant() && + (max->b()).get_constant() <= y.get_constant()) { + return ir::Min::Make(ir::Max::Make(a, max->b()), b); + } + } + } else if (max->b() == a) { + // Select(a <= MAX, max(MIN, a), MAX) → min(max(a, MIN), MAX) + if (max->a().is_constant() && y.is_constant() && b.is_constant()) { + if (y.get_constant() == b.get_constant() && + (max->a()).get_constant() <= y.get_constant()) { + return ir::Min::Make(ir::Max::Make(a, max->a()), b); + } + } + } + } + // Case 4: Select(MIN <= b, min(b, MAX), MIN) → max(min(b, MAX), MIN) + // → min(max(b, MIN), MAX) + if (auto* min = x.As<ir::Min>()) { + if (min->a() == b) { + if ((min->b()).is_constant() && y.is_constant() && a.is_constant()) { + if (y.get_constant() == a.get_constant() && + y.get_constant() <= (min->b()).get_constant()) { + return ir::Min::Make(ir::Max::Make(b, a), min->b()); + } + } + } else if (min->b() == b) { + // Select(MIN <= b, min(MAX, b), MIN) → min(max(b, MIN), MAX) + if ((min->a()).is_constant() && y.is_constant() && a.is_constant()) { + if (y.get_constant() == a.get_constant() && + y.get_constant() <= (min->a()).get_constant()) { + return ir::Min::Make(ir::Max::Make(b, a), min->a()); + } + } + } + } + return Expr(nullptr); + }; + + // 4. Try to optimize different comparison conditions by converting them to + // <= logic + if (auto* ge = node->condition.As<ir::GE>()) { + // Select(a >= b, t, f) → Select(b <= a, t, f) + Expr optimized = TryOptimizeSelect( + ge->b(), ge->a(), node->true_value, node->false_value); + if (optimized.defined()) { + *expr = optimized; + return; + } + } else if (auto* gt = node->condition.As<ir::GT>()) { + // Select(a > b, t, f) → Select(a <= b, f, t) + Expr optimized = TryOptimizeSelect( + gt->a(), gt->b(), node->false_value, node->true_value); + if (optimized.defined()) { + *expr = optimized; + return; + } + } else if (auto* le = node->condition.As<ir::LE>()) { + // Select(a <= b, t, f) → Select(a <= b, t, f) + Expr optimized = TryOptimizeSelect( + le->a(), le->b(), node->true_value, node->false_value); + if (optimized.defined()) { + *expr = optimized; + return; + } + } else if (auto* lt = node->condition.As<ir::LT>()) { + // Select(a < b, t, f) → Select(b <= a, f, t) + Expr optimized = TryOptimizeSelect( + lt->b(), lt->a(), node->false_value, node->true_value); + if (optimized.defined()) { + *expr = optimized; + return; + } + } + } +};
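The rewrite cases handled by TryOptimizeSelect are plain order identities. A quick exhaustive check over small ints (stand-ins for CINN exprs, not the CINN API):

```cpp
#include <algorithm>
#include <cassert>

// Select(c, t, f) modeled as the ternary operator.
int Select(bool c, int t, int f) { return c ? t : f; }

int main() {
  const int kMin = 0, kMax = 255;  // requires MIN <= MAX, as the mutator checks
  for (int a = -300; a <= 300; ++a) {
    for (int b = -300; b <= 300; ++b) {
      assert(Select(a <= b, b, a) == std::max(a, b));  // Case 1
      assert(Select(a <= b, a, b) == std::min(a, b));  // Case 2
    }
    // Case 3, the clamp pattern:
    assert(Select(a <= kMax, std::max(a, kMin), kMax) ==
           std::min(std::max(a, kMin), kMax));
    // Case 4 plus its normalization max(min(b, MAX), MIN) == min(max(b, MIN), MAX):
    assert(Select(kMin <= a, std::min(a, kMax), kMin) ==
           std::min(std::max(a, kMin), kMax));
  }
  return 0;
}
```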
+ +// Optimizes pow(2.0f, ceil(log2(x))) pattern into more efficient bit +// manipulation: +// Original: pow(2.0f, ceil(log2(x))) +// Optimized: ldexpf(1.0f, exponent) where exponent is calculated via: +// 1. float_as_uint(x) - reinterpret float as uint32 +// 2. right_shift(bits, 23) - extract exponent field +// 3. ((bits >> 23) & 0xFF) - 127 - adjust IEEE754 bias +// 4. +1 if mantissa is non-zero (for ceil behavior) +struct SimplifyPowerCeilLog2BitOpLdexpfMutator : public ir::ExprMutator<> { + void operator()(Expr* expr) { ir::ExprMutator<>::Visit(expr, expr); } + + using ir::ExprMutator<>::Visit; + void Visit(const ir::Call* op, Expr* expr) override { + /// 1. First recursively process all sub-expressions + std::vector<Expr> new_args; + for (const auto& arg : op->read_args) { + Expr new_arg = arg; + Visit(&new_arg, &new_arg); + new_args.push_back(new_arg); + } + + // 2. Match target pattern: pow(base, ceil(log2(x))) + if (op->name == "pow" && new_args.size() == 2) { + const Expr& base = new_args[0]; + const Expr& exponent = new_args[1]; + + // Check if exponent is ceil(log2(x)) + if (const ir::Call* ceil_call = exponent.As<ir::Call>()) { + if (ceil_call->name == "ceil" && ceil_call->read_args.size() == 1) { + if (const ir::Call* log2_call = + ceil_call->read_args[0].As<ir::Call>()) { + if (log2_call->name == "log2" && log2_call->read_args.size() == 1 && + log2_call->read_args[0].type().is_float(32)) { + /// Verify base is 2.0f for optimization + bool is_base_two = false; + if (base.is_constant()) { + if (base.get_constant() == 2.0f) { + is_base_two = true; + } + } + if (is_base_two) { + // 3.
Replace with bit operations + ldexpf + Expr x = log2_call->read_args[0]; // Extract log2's argument + + // Create bit operations to compute ceil(log2(x)) + // (1) Reinterpret float as 32-bit integer + Expr bits = ir::Call::Make(common::Int(32), + "__float_as_uint", + {x}, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0, + {}); + + std::vector<ir::Expr> shift_r_args = {bits, ir::Expr(23)}; + Expr shift_r = ir::Call::Make(common::Int(32), + "right_shift", + shift_r_args, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0, + {}); + // (2) Extract exponent part: ((bits >> 23) & 0xFF) - 127 + std::vector<ir::Expr> bitwise_and_exp_args = { + shift_r, ir::Expr(0xFF)}; + Expr bitwise_and_exp = ir::Call::Make(common::Int(32), + "bitwise_and", + bitwise_and_exp_args, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0, + {}); + Expr exponent_raw = + ir::Sub::Make(bitwise_and_exp, ir::Expr(127)); + // (3) Check if mantissa is non-zero (i.e., if exponent+1 is + // needed) + std::vector<ir::Expr> bitwise_and_tail_args = { + bits, ir::Expr(0x007FFFFF)}; + Expr bitwise_and_tail = ir::Call::Make(common::Int(32), + "bitwise_and", + bitwise_and_tail_args, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0, + {}); + Expr mantissa_non_zero = + ir::NE::Make(bitwise_and_tail, ir::Expr(0)); + // (4) Check if it's a normal number (exponent != -127) + Expr is_normal = ir::NE::Make(exponent_raw, ir::Expr(-127)); + // (5) If needed, exponent += 1 + Expr exponent_final = ir::Add::Make( + exponent_raw, + ir::Select::Make( + ir::And::Make(is_normal, mantissa_non_zero), + ir::Expr(1), + ir::Expr(0))); + // (6) Create final expression: ldexpf(1.0f, exponent_final) + Expr new_expr = ir::Call::Make(op->type(), + "ldexpf", + {ir::Expr(1.0f), exponent_final}, + {}, + ir::CallType::Extern, + ir::FunctionRef(), + 0, + {}); + *expr = new_expr; + return; + } + } + } + } + } + + // For non-target patterns, reconstruct as-is + if (new_args != op->read_args) { + *expr = ir::Call::Make(op->type(), + op->name, + new_args, + op->write_args, + op->call_type, + op->func, + op->value_index, + op->attrs); + } + } +}; + struct SimplifyUnitBlockMutator : public ir::ExprMutator<> { void operator()(Expr* x) { ir::ExprMutator::Visit(x, x); } @@ -498,6 +788,8 @@ void Simplify(Expr* expr) { SimplifyLogicalMutator()(expr); SimplifyIfThenElseMutator()(expr); SimplifySelectMutator()(expr); + SimplifySelect2MinMaxMutator()(expr); + SimplifyPowerCeilLog2BitOpLdexpfMutator()(expr); SimplifyNoPureMathMutator()(expr); VLOG(6) << "End Simplify " << *expr; }
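A CPU-side check of the bit formula the new mutator emits: extracting the IEEE-754 exponent field and bumping it when the mantissa is non-zero reproduces ceil(log2(x)) for normal positive floats. A hedged sketch in portable C++ (the generated code targets the extern calls above; memcpy stands in for __float_as_uint):

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// Mirrors the emitted sequence: __float_as_uint, >> 23, & 0xFF, - 127,
// +1 when the mantissa bits are non-zero, then ldexpf(1.0f, exponent).
float CeilPow2(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  int exponent = static_cast<int>((bits >> 23) & 0xFF) - 127;
  bool mantissa_non_zero = (bits & 0x007FFFFF) != 0;
  bool is_normal = exponent != -127;
  if (is_normal && mantissa_non_zero) exponent += 1;
  return std::ldexp(1.0f, exponent);
}

int main() {
  const float samples[] = {1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 5.5f, 1000.0f};
  for (float x : samples) {
    int expected = static_cast<int>(std::ceil(std::log2(x)));
    assert(CeilPow2(x) == std::ldexp(1.0f, expected));  // == pow(2, ceil(log2 x))
  }
  return 0;
}
```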
diff --git a/paddle/cinn/optim/simplify_util.cc b/paddle/cinn/optim/simplify_util.cc index 0c02ff5ce9bb89..5fa37a3ccc3d01 100644 --- a/paddle/cinn/optim/simplify_util.cc +++ b/paddle/cinn/optim/simplify_util.cc @@ -677,8 +677,124 @@ std::optional> MatchPattern( return std::nullopt; } +/*! + * \brief Optimize linear division and modulo operations with constant + * denominators. + * + * This function handles linear expressions of the form + * `(a * C1 + b) / C2` and `(a * C1 + b) % C2` + * where C1 and C2 are constants. It specifically targets: + * 1. Linear combinations in the numerator (sums of terms) + * 2. Constant denominators + * + * The optimization: + * 1. Separates terms divisible by the denominator (linear coefficients) + * 2. Groups remaining terms as a remainder expression + * 3. For division: + * - Returns the sum of divisible terms if remainder < denominator + * - Otherwise preserves the original division + * 4. For modulo: + * - Returns the remainder if it's provably smaller than denominator + * - Otherwise preserves the original modulo + * + * Example linear optimizations: + * 1. Linear division: (x * 8 + y * 4 + 3) / 4 → x*2 + y + 0 (when 3 < 4) + * 2. Linear modulo: (x * 8 + y * 4 + 3) % 4 → 0 + 0 + 3 + * 3. Partial division: (x * 6 + a + b) / 3 → x * 2 + (a + b) / 3 (when a < 3 and b < 3 but a + b may reach 3) + * + * \param expr The linear division/modulo expression to optimize + * \param ana Symbolic analyzer for proving expression bounds + * \return Simplified expression if provably correct, original otherwise + */ +ir::IndexExpr HandleDivModWithConstants( + const ir::IndexExpr &expr, const common::SymbolicExprAnalyzer &ana) { + // Get numerator and denominator + auto numerator = expr.operand(0); + auto denominator = expr.operand(1); + + // Check if denominator is a constant + if (!denominator.is_constant()) { + return expr; + } + int64_t denom_val = denominator.as_int64(); + + // Recursively expand addition chain and collect all terms + std::vector<ir::IndexExpr> terms = optim::GetFlattenExprs(numerator); + if (terms.empty()) { + return expr; + } + + // Separate terms that are multiples of denominator from other terms + std::vector<ir::IndexExpr> multiple_terms; + std::vector<ir::IndexExpr> remainder_terms; + + for (auto &term : terms) { + if (term.node_type() == ir::IrNodeTy::Mul) { + auto rhs = term.operand(1); + if (rhs.is_constant() && rhs.as_int64() % denom_val == 0) { + // Extract terms divisible by denominator + multiple_terms.push_back( + term.operand(0) * + (rhs.as_int64() / denom_val)); // Extract multiplicand part + continue; + } + } + // Extract terms not divisible by denominator + auto remainder_upper = ana.UpperBound(term); + if (!ana.ProveLT(remainder_upper, denominator).value_or(false)) { + return expr; + } + remainder_terms.push_back(term); + } + + // Build remainder expression + ir::IndexExpr remainder_expr; + if (remainder_terms.empty()) { + remainder_expr = ir::IndexExpr(0); + } else if (remainder_terms.size() == 1) { + remainder_expr = remainder_terms[0]; + } else { + remainder_expr = ir::Add::Make(remainder_terms[0], remainder_terms[1]); + for (size_t i = 2; i < remainder_terms.size(); ++i) { + remainder_expr = ir::Add::Make(remainder_expr, remainder_terms[i]); + } + } + + // Build multiplicand terms expression + ir::IndexExpr multiple_expr; + if (multiple_terms.empty()) { + multiple_expr = ir::IndexExpr(0); + } else if (multiple_terms.size() == 1) { + multiple_expr = multiple_terms[0]; + } else { + multiple_expr = ir::Add::Make(multiple_terms[0], multiple_terms[1]); + for (size_t i = 2; i < multiple_terms.size(); ++i) { + multiple_expr = ir::Add::Make(multiple_expr, multiple_terms[i]); + } + } + + // Verify if remainder range is less than denominator + auto remainder_upper = ana.UpperBound(remainder_expr); + if (!ana.ProveLT(remainder_upper, denominator).value_or(false)) { + // If the remainder may reach the denominator, keep the division/modulo of the remainder + if (expr.node_type() == ir::IrNodeTy::Div) { + return ir::Add::Make(multiple_expr, + ir::Div::Make(remainder_expr, denominator)); + } else { // Modulo operation + return ir::Mod::Make(remainder_expr, denominator); + } + } else { + // If the remainder is provably less than the denominator, its division is zero + if (expr.node_type() == ir::IrNodeTy::Div) { + return multiple_expr; + } else { // Modulo operation + return remainder_expr; + } + } +} +
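A concrete instance of the split performed by HandleDivModWithConstants, with non-negative ints standing in for symbolic terms whose bounds the analyzer can prove:

```cpp
#include <cassert>

int main() {
  for (int x = 0; x < 32; ++x) {
    for (int y = 0; y < 32; ++y) {
      // Residual 3 < 4: divisible terms keep coeff / 4, modulo keeps 3.
      int e = x * 8 + y * 4 + 3;
      assert(e / 4 == x * 2 + y);
      assert(e % 4 == 3);
      // Residual terms 3 and 2 are each < 4, but their sum reaches 4,
      // so the residual keeps its own division: x*2 + (3 + 2) / 4.
      assert((x * 8 + 3 + 2) / 4 == x * 2 + (3 + 2) / 4);
    }
  }
  return 0;
}
```

Note that the early `return expr` inside the term loop means a single residual term that may reach the denominator bails out before any split; only sums of individually small terms reach the partial-division branch.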
ir::IndexExpr BoundSimplify(const ir::IndexExpr &expr) { - // return expr if expr is not a division or modulo + // Return expr if expr is not a division or modulo if (expr.node_type() != ir::IrNodeTy::Div && expr.node_type() != ir::IrNodeTy::Mod) return expr; @@ -686,10 +802,10 @@ ir::IndexExpr BoundSimplify(const ir::IndexExpr &expr) { common::cas_intervals_t var_intervals = common::CollectVarIntervalsOfExprs({expr}); common::SymbolicExprAnalyzer ana(var_intervals); - // Because the SymbolicExprAnalyzer bound result is [lower, upper), `ProveLE` - // is used here instead of `ProveLT`. + // Because the SymbolicExprAnalyzer bound result is [lower, upper], + // `ProveLT` is used here instead of `ProveLE`. auto canBeSimplified = - ana.ProveLE(ana.UpperBound(expr.operand(0)), expr.operand(1)); + ana.ProveLT(ana.UpperBound(expr.operand(0)), expr.operand(1)); if (canBeSimplified.value_or(false)) { if (expr.node_type() == ir::IrNodeTy::Div) { @@ -698,7 +814,8 @@ ir::IndexExpr BoundSimplify(const ir::IndexExpr &expr) { return expr.operand(0); } } - return expr; + + return HandleDivModWithConstants(expr, ana); } ir::IndexExpr BroadcastSimplify(const ir::IndexExpr &expr) { diff --git a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h index 7962c933db0721..1ca95efbd68678 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h +++ b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h @@ -19,21 +19,29 @@ #include "paddle/phi/core/distributed/auto_parallel/placement_types.h" #include "paddle/phi/core/distributed/auto_parallel/process_mesh.h" -paddle::Tensor add_n_ad_func(const std::vector<paddle::Tensor>& x); +paddle::Tensor add_n_ad_func( + const std::vector<paddle::Tensor>& x, + paddle::optional<paddle::Tensor*> input_out = paddle::none); -paddle::Tensor conv2d_ad_func(const paddle::Tensor& input, - const paddle::Tensor& filter, - std::vector<int> strides, - std::vector<int> paddings, - std::string padding_algorithm, - std::vector<int> dilations, - int groups, - std::string data_format); +paddle::Tensor conv2d_ad_func( + const paddle::Tensor& input, + const paddle::Tensor& filter, + std::vector<int> strides, + std::vector<int> paddings, + std::string padding_algorithm, + std::vector<int> dilations, + int groups, + std::string data_format, + paddle::optional<paddle::Tensor*> input_out = paddle::none); -paddle::Tensor multiply_ad_func(const paddle::Tensor& x, - const paddle::Tensor& y); -paddle::Tensor& multiply__ad_func(paddle::Tensor& x, // NOLINT - const paddle::Tensor& y); +paddle::Tensor multiply_ad_func( + const paddle::Tensor& x, + const paddle::Tensor& y, + paddle::optional<paddle::Tensor*> input_out = paddle::none); +paddle::Tensor& multiply__ad_func( + paddle::Tensor& x, // NOLINT + const paddle::Tensor& y, + paddle::optional<paddle::Tensor*> input_out = paddle::none); std::tuple input_out = paddle::none); paddle::Tensor dtensor_to_local_ad_function( const paddle::Tensor& input, const phi::distributed::ProcessMesh& processmesh, - const phi::distributed::Placements& placements); + const phi::distributed::Placements& placements, + paddle::optional<paddle::Tensor*> input_out = paddle::none); paddle::Tensor dtensor_from_local_ad_function( const paddle::Tensor& input, const phi::distributed::ProcessMesh& processmesh, - const phi::distributed::Placements& placements); + const phi::distributed::Placements& placements, + paddle::optional<paddle::Tensor*> input_out = paddle::none); namespace sparse { std::tuple& x) { +paddle::Tensor add_n_ad_func(const std::vector<paddle::Tensor>& x, + paddle::optional<paddle::Tensor*> input_out) { VLOG(3) << "Running AD API: " << "add_n"; if (FLAGS_check_cuda_error) [[unlikely]] {
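The new trailing parameter on these forward declarations defaults to paddle::none, so existing call sites compile unchanged; the Python-C layer below fills it from an `out=` kwarg via GetInputOutTensorFromKwargs. A simplified model of the pattern, where std::optional and a hypothetical Tensor stand in for paddle::optional and paddle::Tensor (this is not the Paddle API):

```cpp
#include <iostream>
#include <optional>

struct Tensor { float data = 0.f; };  // hypothetical stand-in

// Mirrors the generated signature: the optional out-pointer defaults to
// "not provided", and when present the result is written through it.
Tensor add_n(const Tensor& a, const Tensor& b,
             std::optional<Tensor*> input_out = std::nullopt) {
  Tensor local;
  Tensor& out = (input_out && *input_out) ? **input_out : local;
  out.data = a.data + b.data;
  return out;
}

int main() {
  Tensor a{1.f}, b{2.f}, reused;
  add_n(a, b, &reused);                   // caller-provided destination
  std::cout << reused.data << "\n";       // 3
  std::cout << add_n(a, b).data << "\n";  // default path, fresh output
  return 0;
}
```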
diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index 893249fddc904a..0aea3ba196798f 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -31,7 +31,8 @@ paddle::Tensor conv2d_ad_func(const paddle::Tensor& input, std::string padding_algorithm, std::vector<int> dilations, int groups, - std::string data_format) { + std::string data_format, + paddle::optional<paddle::Tensor*> input_out) { VLOG(3) << "Running AD API: " << "conv2d"; if (FLAGS_check_cuda_error) [[unlikely]] { diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_from_local_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_from_local_fwd_func.cc index 8fa3b0a11a3cfd..4a06c524dc194d 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_from_local_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_from_local_fwd_func.cc @@ -25,7 +25,8 @@ COMMON_DECLARE_bool(check_cuda_error); paddle::Tensor dtensor_from_local_ad_function( const paddle::Tensor& input, const phi::distributed::ProcessMesh& process_mesh, - const phi::distributed::Placements& placements) { + const phi::distributed::Placements& placements, + paddle::optional<paddle::Tensor*> input_out) { #ifdef PADDLE_WITH_DISTRIBUTE VLOG(3) << "Running AD API: " << "dtensor_from_local dygraph"; diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_to_local_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_to_local_fwd_func.cc index 02d8f368e37953..be18aea8abd79d 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_to_local_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/dtensor_to_local_fwd_func.cc @@ -23,7 +23,8 @@ COMMON_DECLARE_bool(check_cuda_error); paddle::Tensor dtensor_to_local_ad_function( const paddle::Tensor& input, const phi::distributed::ProcessMesh& process_mesh, - const phi::distributed::Placements& placements) { + const phi::distributed::Placements& placements, + paddle::optional<paddle::Tensor*> input_out) { #ifdef PADDLE_WITH_DISTRIBUTE VLOG(3) << "Running AD API: " << "dtensor_to_local dygraph"; diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc index def887365f3246..4c03ee6ef486b1 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc @@ -38,7 +38,8 @@ bool check_if_support_elementwise_mul_mem_opt(const std::string& device_type) { } paddle::Tensor multiply_ad_func(const paddle::Tensor& x, - const paddle::Tensor& y) { + const paddle::Tensor& y, + paddle::optional<paddle::Tensor*> input_out) { FLAGS_tensor_operants_mode = "eager"; VLOG(3) << "Running AD API: " << "multiply"; @@ -241,7 +242,8 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x, } paddle::Tensor& multiply__ad_func(paddle::Tensor& x, // NOLINT - const paddle::Tensor& y) { + const paddle::Tensor& y, + paddle::optional<paddle::Tensor*> input_out) { FLAGS_tensor_operants_mode = "eager"; VLOG(3) << "Running AD API: " << "multiply_"; diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/reshard_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/reshard_fwd_func.cc index 5aaf63d6c8c411..c048a4248c3184 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/reshard_fwd_func.cc +++
b/paddle/fluid/eager/api/manual/eager_manual/forwards/reshard_fwd_func.cc @@ -22,7 +22,8 @@ COMMON_DECLARE_bool(check_cuda_error); paddle::Tensor reshard_ad_function( const paddle::Tensor& input, - const phi::distributed::TensorDistAttr dist_attr) { + const phi::distributed::TensorDistAttr dist_attr, + paddle::optional input_out) { #ifdef PADDLE_WITH_DISTRIBUTE VLOG(3) << "Running AD API: " << "reshard dygraph"; diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 6cc1d9e8ba2b48..ee95ac3da7d3a7 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -660,6 +660,7 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/framework/op_registry.h" #include "paddle/utils/test_macros.h" #include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" +#include "paddle/utils/optional.h" using CPUPlace = phi::CPUPlace; {} {} @@ -1496,7 +1497,7 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False): self.grad_node_out_list = grad_node_out_list - def run(self): + def run(self, append_input_out=False): # Basic Validation Check self.DygraphYamlValidationCheck() @@ -1684,7 +1685,9 @@ def GenerateForwardLayoutAutotune( return layout_logic_str - def GenerateForwardDefinitionAndDeclaration(self, is_inplaced, grad_flag): + def GenerateForwardDefinitionAndDeclaration( + self, is_inplaced, grad_flag, append_input_out + ): namespace = self.namespace if self.forward_api_name[-1] == '_' and not is_inplaced: return @@ -1881,6 +1884,24 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced, grad_flag): inputs_args_declaration_str = ", ".join(inputs_args_declaration_list) inputs_args_definition_str = ", ".join(inputs_args_definition_list) + if ( + append_input_out + and not grad_flag + and not is_inplaced + and len(self.forward_outputs_position_map) == 1 + and next(iter(self.forward_outputs_position_map.values()))[0] + == "Tensor" + and forward_api_name != "empty_like" + ): + inputs_args_declaration_str = ( + inputs_args_declaration_str + + ", paddle::optional input_out = paddle::none" + ) + inputs_args_definition_str = ( + inputs_args_definition_str + + ", paddle::optional input_out" + ) + inputs_call_list.append("input_out") inputs_call_args_str = ", ".join(inputs_call_list) self.inputs_call_list = inputs_call_list @@ -2135,6 +2156,16 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced, grad_flag): + " ".join(amp_autocast_optional_list) ) amp_inputs_call_args_str = ", ".join(amp_inputs_call_list) + if ( + append_input_out + and not grad_flag + and not is_inplaced + and len(self.forward_outputs_position_map) == 1 + and next(iter(self.forward_outputs_position_map.values()))[0] + == "Tensor" + and forward_api_name != "empty_like" + ): + amp_inputs_call_args_str = amp_inputs_call_args_str + ", input_out" amp_call_str = ( f"return {forward_ad_function_name}({amp_inputs_call_args_str});" ) @@ -2158,6 +2189,18 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced, grad_flag): type_promote_inputs_call_args_str = ", ".join( type_promote_inputs_call_list ) + if ( + append_input_out + and not grad_flag + and not is_inplaced + and len(self.forward_outputs_position_map) == 1 + and next(iter(self.forward_outputs_position_map.values()))[0] + == "Tensor" + and forward_api_name != "empty_like" + ): + type_promote_inputs_call_args_str = ( + type_promote_inputs_call_args_str + 
", input_out" + ) type_promote_call_list = f"return {forward_ad_function_name}({type_promote_inputs_call_args_str});" x_cast = ( @@ -2180,6 +2223,19 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced, grad_flag): type_promote_inputs_call_args_str = ", ".join( type_promote_inputs_call_list ) + if ( + append_input_out + and not grad_flag + and not is_inplaced + and len(self.forward_outputs_position_map) == 1 + and next(iter(self.forward_outputs_position_map.values()))[0] + == "Tensor" + and forward_api_name != "empty_like" + ): + type_promote_inputs_call_args_str = ( + type_promote_inputs_call_args_str + ", input_out" + ) + type_promote_call_list = f"return {forward_ad_function_name}({type_promote_inputs_call_args_str});" x_cast = ( @@ -2323,7 +2379,9 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced, grad_flag): self.forward_declaration_str += f"TEST_API {returns_type_str} {forward_ad_function_name}({inputs_args_declaration_str});\n" - def GenerateInplacedForwardDygraphFunctions(self, grad_flag): + def GenerateInplacedForwardDygraphFunctions( + self, grad_flag, append_input_out + ): # Inplaced Version Dygraph Function Generation forward_api_name = self.forward_api_name forward_api_contents = self.forward_api_contents @@ -2331,7 +2389,9 @@ def GenerateInplacedForwardDygraphFunctions(self, grad_flag): if forward_api_name != "sum" and "inplace" in forward_api_contents: # Function Definition and Declaration Generation self.GenerateForwardDefinitionAndDeclaration( - is_inplaced=True, grad_flag=grad_flag + is_inplaced=True, + grad_flag=grad_flag, + append_input_out=append_input_out, ) self.UpdateCoreOpsInformation(is_inplaced=True) @@ -2367,8 +2427,8 @@ def UpdateCoreOpsInformation(self, is_inplaced): for name, (ttype, pos) in forward_outputs_position_map.items(): core_ops_returns_info[fwd_api_name][pos] = name - def run(self, grad_flag=False): - super().run() + def run(self, grad_flag=False, append_input_out=False): + super().run(append_input_out=append_input_out) ################### # Code Generation # @@ -2376,12 +2436,16 @@ def run(self, grad_flag=False): # Definition And Declaration self.GenerateForwardDefinitionAndDeclaration( - is_inplaced=False, grad_flag=grad_flag + is_inplaced=False, + grad_flag=grad_flag, + append_input_out=append_input_out, ) self.UpdateCoreOpsInformation(is_inplaced=False) - self.GenerateInplacedForwardDygraphFunctions(grad_flag) + self.GenerateInplacedForwardDygraphFunctions( + grad_flag, append_input_out=append_input_out + ) class DygraphNodeGenerator(DygraphFunctionGeneratorBase): @@ -3214,8 +3278,8 @@ def _gen_api_call_code_block( returns_str, ) - def run(self): - super().run() + def run(self, append_input_out=False): + super().run(append_input_out=append_input_out) self.ResetOptionalInputs() @@ -3299,7 +3363,7 @@ def GetBackwardAPIContents(self, forward_api_contents): return backward_api_contents - def GenerateCode(self, grad_flag=False): + def GenerateCode(self, grad_flag=False, append_input_out=True): if grad_flag: op_string = 'backward_op' else: @@ -3347,7 +3411,9 @@ def GenerateCode(self, grad_flag=False): forward_apis_dict, namespace, ) - function_generator.run(grad_flag) + function_generator.run( + grad_flag, append_input_out=append_input_out + ) self.forward_definition_str += ( function_generator.forward_definition_str + "\n" @@ -3372,7 +3438,7 @@ def GenerateCode(self, grad_flag=False): namespace, next_grad_api_contents, ) - node_generator.run() + node_generator.run(append_input_out=append_input_out) self.node_declaration_str 
+= ( node_generator.node_declaration_str + "\n" ) @@ -3407,12 +3473,12 @@ def GenerateCode(self, grad_flag=False): namespace, self.node_definition_str ) - def run(self, grad_flag=False): + def run(self, grad_flag=False, append_input_out=False): self.ParseYamlContents() self.InferNameSpace() - self.GenerateCode(grad_flag) + self.GenerateCode(grad_flag, append_input_out=append_input_out) ################ @@ -3521,7 +3587,10 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str, grad_flag): generator = DygraphForwardAndNodesGenerator( api_yaml_path, backward_yaml_path ) - generator.run() + append_input_out = ( + "string" not in api_yaml_path and "sparse" not in api_yaml_path + ) + generator.run(append_input_out=append_input_out) node_declaration_str += generator.node_declaration_str + "\n" node_definition_str += generator.node_definition_str + "\n" @@ -3556,7 +3625,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str, grad_flag): backward_yaml_path, backward_yaml_path ) - generator_grad.run(True) + generator_grad.run(True, append_input_out=False) backward_declaration_str += ( generator_grad.forward_declaration_str + "\n" diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index dc05025ee8d6d6..661427fd069bab 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -126,6 +126,8 @@ def FindParsingFunctionFromAttributeType(atype): // Get EagerTensors from args {} // Parse Attributes if needed +{} + // Parse input_out if needed {} tstate = PyEval_SaveThread(); @@ -335,7 +337,7 @@ def CollectIsForwardOnly(self): False if 'backward' in forward_api_contents.keys() else True ) - def GeneratePythonCFunction(self): + def GeneratePythonCFunction(self, no_input_out_tensor=False): namespace = self.namespace forward_inplace_map = self.forward_inplace_map forward_api_name = self.forward_api_name @@ -498,6 +500,22 @@ def GeneratePythonCFunction(self): dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) + get_input_out_str = "" + if ( + not no_input_out_tensor + and not forward_inplace_map + and len(self.forward_outputs_position_map) == 1 + and next(iter(self.forward_outputs_position_map.values()))[0] + == "Tensor" + and forward_api_name != "empty_like" + ): + dygraph_function_call_str = ( + dygraph_function_call_str + ", input_out" + ) + get_input_out_str = ( + " auto input_out = GetInputOutTensorFromKwargs(kwargs);" + ) + # Generate Python-C Function Definitions fwd_function_name = FUNCTION_NAME_TEMPLATE.format( "::", namespace, GetForwardFunctionName(forward_api_name) @@ -524,6 +542,7 @@ def GeneratePythonCFunction(self): forward_api_name, get_eager_tensor_str, parse_attributes_str, + get_input_out_str, set_device_str, noamp_dygraph_function_str, return_str, @@ -581,6 +600,7 @@ def GeneratePythonCFunction(self): inplaced_forward_api_name, get_eager_tensor_str, parse_attributes_str, + "", set_device_str, inplace_noamp_dygraph_function_str, return_str, @@ -618,7 +638,7 @@ def GeneratePythonCFunction(self): # Generate Python-C Function Registration self.python_c_function_reg_str += python_c_inplace_func_reg_str - def run(self): + def run(self, no_input_out_tensor=False): # Initialized is_forward_only self.CollectIsForwardOnly() @@ -640,7 +660,7 @@ def run(self): ) # Code Generation - self.GeneratePythonCFunction() + 
self.GeneratePythonCFunction(no_input_out_tensor) return True @@ -658,7 +678,7 @@ def __init__(self, path): self.python_c_functions_reg_str = "" self.python_c_function_declare_str = "" - def GeneratePythonCFunctions(self): + def GeneratePythonCFunctions(self, no_input_out_tensor=False): namespace = self.namespace forward_api_list = self.forward_api_list @@ -670,7 +690,7 @@ def GeneratePythonCFunctions(self): f_generator = PythonCSingleFunctionGenerator( forward_api_content, namespace ) - status = f_generator.run() + status = f_generator.run(no_input_out_tensor) if status: self.python_c_functions_str += ( @@ -698,7 +718,7 @@ def AttachNamespace(self): ) ) - def run(self): + def run(self, no_input_out_tensor=False): # Infer namespace from yaml_path self.InferNameSpace() @@ -706,7 +726,7 @@ def run(self): self.ParseForwardYamlContents() # Code Generation - self.GeneratePythonCFunctions() + self.GeneratePythonCFunctions(no_input_out_tensor) # Wrap with namespace self.AttachNamespace() @@ -763,8 +783,14 @@ def GeneratePythonCFile(filepath, python_c_str): for i in range(len(api_yaml_paths)): api_yaml_path = api_yaml_paths[i] + no_input_out_tensor = ( + "backward" in api_yaml_path + or "strings" in api_yaml_path + or "sparse" in api_yaml_path + ) + py_c_generator = PythonCGenerator(api_yaml_path) - py_c_generator.run() + py_c_generator.run(no_input_out_tensor) generated_python_c_functions += ( py_c_generator.python_c_functions_str + "\n" diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index 78e9db9e9b8d68..8d594c10392fb1 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -29,14 +29,14 @@ void SetOp(ProgramDesc* prog, const std::string& name, const std::vector& inputs, const std::vector& outputs, - bool use_mkldnn = false, + bool use_onednn = false, ISTEST_STATE is_test = ISTEST_STATE::UNSET) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); op->SetAttr("name", name); op->SetInput("X", inputs); op->SetOutput("Out", outputs); - op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("use_onednn", use_onednn); if (is_test == ISTEST_STATE::UNSET) op->MutableAttrMap()->erase("is_test"); else if (is_test == ISTEST_STATE::FALSE) diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc index 20f130cb37208e..f03d7a160e1048 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc @@ -57,7 +57,7 @@ bool validateReduceOpAttrs(const Node* node, EXPECT_TRUE( !PADDLE_GET_CONST(bool, op->GetAttr("reduce_all")), ::paddle::string::Sprintf( - "The LayerNorm fusion %s" + "The LayerNorm fusion %s " "reduction must have \'reduce_all\' attribute set to false.", name)); } diff --git a/paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h b/paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h index cfd4875c73bf3e..f8f0056ff5829f 100644 --- a/paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h +++ b/paddle/fluid/framework/ir/onednn/activation_onednn_fuse_pass.h @@ -59,12 +59,21 @@ inline std::unordered_map GetAttributeMap( inline void SetActivationAttrs(paddle::framework::OpDesc* fused_op, paddle::framework::OpDesc* act_op, const std::string& act_type) { - if (fused_op->HasAttr("use_mkldnn")) { + bool use_mkldnn = false; + if (fused_op->HasAttr("use_mkldnn") && !fused_op->HasAttr("use_onednn")) { 
PADDLE_ENFORCE(PADDLE_GET_CONST(bool, fused_op->GetAttr("use_mkldnn")), common::errors::PreconditionNotMet( - "oneDNN activation fuses require use_mkldnn=True")); + "oneDNN activation fuses require use_onednn=True")); + } + if (fused_op->HasAttr("use_mkldnn")) { + use_mkldnn = PADDLE_GET_CONST(bool, fused_op->GetAttr("use_mkldnn")); + } + if (!use_mkldnn && fused_op->HasAttr("use_onednn")) { + PADDLE_ENFORCE(PADDLE_GET_CONST(bool, fused_op->GetAttr("use_onednn")), + common::errors::PreconditionNotMet( + "oneDNN activation fuses require use_onednn=True")); } - fused_op->SetAttr("use_mkldnn", true); + fused_op->SetAttr("use_onednn", true); auto attr_map = GetAttributeMap(act_type); for (const auto& attr : attr_map) { diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 5d504c71ff1033..fa0df97f219b27 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1420,7 +1420,7 @@ struct SimpleOpTypeSetTeller : public Teller { #endif if (dtype != -1 && dtype != 2 && dtype != 3 && dtype != 5 && dtype != 6) { VLOG(3) - << "the fill_any_like only supports int32/int64/float32/float64 by" + << "the fill_any_like only supports int32/int64/float32/float64 by " "trt8.4 below"; return false; } diff --git a/paddle/fluid/operators/collective/c_embedding_op.cc b/paddle/fluid/operators/collective/c_embedding_op.cc index 2c228e5a17775c..182ace60aa7fce 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cc +++ b/paddle/fluid/operators/collective/c_embedding_op.cc @@ -87,7 +87,7 @@ class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker { "and the out-of-bounds will be set to 0 ") .SetDefault(0); AddAttr("vocab_size", - "(int64, default -1), The total vocabulary size to check" + "(int64, default -1), The total vocabulary size to check " "the out-of-bounds ids. If it is -1, no check will be ") .SetDefault(-1); AddComment(R"DOC( diff --git a/paddle/fluid/operators/fused/fused_adam_op.cc b/paddle/fluid/operators/fused/fused_adam_op.cc index 932bdbfd90a6c2..7a890e3e961503 100644 --- a/paddle/fluid/operators/fused/fused_adam_op.cc +++ b/paddle/fluid/operators/fused/fused_adam_op.cc @@ -115,7 +115,7 @@ class FusedAdamOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0); AddAttr("use_adamw", "(bool, default False) " - "Whether to use AdamW" + "Whether to use AdamW. 
" "True for decoupled weight decay") .SetDefault(false); AddAttr("multi_precision", diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index c69e9f98497391..fc58a32ef7c0aa 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -124,9 +124,9 @@ class FusedAttentionOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(y_dim.size(), 2, common::errors::InvalidArgument( - "The dimensions of qkv_weight must be 2 if enable" - "transpose_qkv_wb: (dim_embed, 3 * dim_embed)," - "but received dimensions of" + "The dimensions of qkv_weight must be 2 if enable " + "transpose_qkv_wb: (dim_embed, 3 * dim_embed), " + "but received dimensions of " "Input is [%d]", y_dim.size())); PADDLE_ENFORCE_GT(num_heads, @@ -159,7 +159,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(y_dim.size(), 4, common::errors::InvalidArgument( - "The dimensions of qkv_weight must be 4 if not" + "The dimensions of qkv_weight must be 4 if not " "enable transpose_qkv_wb: (3, num_head, dim_head, " "dim_embed), but received [%d]", y_dim.size())); @@ -186,8 +186,8 @@ class FusedAttentionOp : public framework::OperatorWithKernel { x_dim.size(), 3, common::errors::InvalidArgument("The dimensions of x must be 3" - "(batch_size, seq_len, dim_embed)," - "but received dimensions of" + "(batch_size, seq_len, dim_embed), " + "but received dimensions of " "Input is [%d]", x_dim.size())); @@ -431,7 +431,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { "attn_dropout_implementation", "[\"downgrade_in_infer\"|\"upscale_in_train\"]" "There are two kinds of ways to implement dropout" - "(the mask below is a tensor have the same shape with input" + "(the mask below is a tensor have the same shape with input, " "the value of mask is 0 or 1, the ratio of 0 is dropout_rate)" "1. downgrade_in_infer(default), downgrade the outcome at inference " "time" diff --git a/paddle/fluid/operators/fused/fused_conv2d_op.cc b/paddle/fluid/operators/fused/fused_conv2d_op.cc index 04d2d4043bf966..fb7bb428ef24ba 100644 --- a/paddle/fluid/operators/fused/fused_conv2d_op.cc +++ b/paddle/fluid/operators/fused/fused_conv2d_op.cc @@ -53,13 +53,13 @@ TODO: Documentation of conv2d op. protected: void Apply() { AddInput("Bias", - "(Tensor) Bias to be added to each output of filter application." - "The format of output tensor is X (one-dimensional) of size equal" + "(Tensor) Bias to be added to each output of filter application. " + "The format of output tensor is X (one-dimensional) of size equal " "to the number of output channels. Only used with MKL-DNN.") .AsDispensable(); AddInput("ResidualData", "(Tensor) Tensor with residual data " - "to which convolution output will be added." + "to which convolution output will be added. 
" "Used with fuse_residual_connection fusion.") .AsDispensable(); AddAttr("fuse_activation", diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc index 28a87239f37693..c4a1ce652c905d 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc @@ -72,16 +72,16 @@ class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel { x_dim.size(), 3, common::errors::InvalidArgument("The dimensions of x must be 3" - "(batch_size, seq_len, dim_embed)," - "but received dimensions of" + "(batch_size, seq_len, dim_embed), " + "but received dimensions of " "Input is [%d]", x_dim.size())); PADDLE_ENFORCE_EQ( y_dim.size(), 4, common::errors::InvalidArgument("The dimensions of qkv_weight must be 4" - "(3, num_head, dim_head, dim_embed)," - "but received dimensions of" + "(3, num_head, dim_head, dim_embed), " + "but received dimensions of " "Input is [%d]", y_dim.size())); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index 93c688d149ac77..d2a262e2bac763 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -64,7 +64,7 @@ void MultiGRUOp::InferShape(framework::InferShapeContext* ctx) const { wx_dims[i][0], x_mat_dims[1], common::errors::InvalidArgument( - "The first dimension of flattened WeightX #%d" + "The first dimension of flattened WeightX #%d " "should equal to last dimension of flattened input X, but " "received fattened WeightX dimension is:%d, flattened X dimension " "is:%d", @@ -205,7 +205,7 @@ void MultiGRUOpMaker::Make() { "Number of stacked GRU layers.") .SetDefault(1); AddAttr("origin_mode", - "bool" + "bool " "use origin mode in article https://arxiv.org/abs/1412.3555") .SetDefault(false); AddAttr( diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc index 8033cdb6489016..771c0ff19ce2c8 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.cc +++ b/paddle/fluid/operators/memcpy_d2h_op.cc @@ -62,12 +62,13 @@ class MemcpyD2HOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(phi::DenseTensor) The type of output " "is the same as input X."); - AddAttr("dst_place_type", - "Determine the dst place of tensor copy. " - "By Now it ONLY support XPU/CUDAPlace <-> CUDAPinnedPlace/CPU" - "Other place type is Unimplemented and will cause ERROR." - "0: dst is on CPUPlace. " - "1: dst is on CUDAPinnedPlace. "); + AddAttr( + "dst_place_type", + "Determine the dst place of tensor copy. " + "By Now it ONLY support XPU/CUDAPlace <-> CUDAPinnedPlace/CPU. " + "Other place type is Unimplemented and will cause ERROR. " + "0: dst is on CPUPlace. " + "1: dst is on CUDAPinnedPlace. "); AddComment(R"DOC( MemcpyD2H Operator. By now, it ONLY supports the memcopy between CUDAPlace <-> CUDAPinnedPlace/CPU. diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc index a65758f5ecf8a8..5a01de461429a2 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.cc +++ b/paddle/fluid/operators/memcpy_h2d_op.cc @@ -64,10 +64,10 @@ class MemcpyH2DOpProtoMaker : public framework::OpProtoAndCheckerMaker { "is the same as input X."); AddAttr("dst_place_type", "Determine the dst place of tensor copy. " - "By Now it support:" - "0. CUDAPinnedPlace/CPU <->CUDAPlace" - "1. CPU <->XPUPlace" - "2. 
CPU <->IPUPlace" + "By Now it support: " + "0. CUDAPinnedPlace/CPU <->CUDAPlace. " + "1. CPU <->XPUPlace. " + "2. CPU <->IPUPlace. " "Other place type is Unimplemented and will cause ERROR."); AddComment(R"DOC( MemcpyD2H Operator. diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc index 88737990847f34..cef735b1fdac82 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc @@ -50,7 +50,7 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor of sequence_mask op."); AddOutput("Y", "The output mask of sequence_mask op."); AddInput("MaxLenTensor", - "Max length tensor" + "Max length tensor " "have higher priority than maxlen attribute") .AsDispensable(); AddAttr("maxlen", diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 79a93bfca9e9e6..8fde85928e4070 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -127,8 +127,8 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput( "X", - "A Variable list. The shape and data type of the list elements" - "should be consistent. Variable can be multi-dimensional Tensor" + "A Variable list. The shape and data type of the list elements " + "should be consistent. Variable can be multi-dimensional Tensor " "or phi::DenseTensor, and data types can be: float32, float64, int32, " "int64.") .AsDuplicable(); diff --git a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py index c426d3325a0811..0c70fbb72f98d3 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py @@ -134,6 +134,7 @@ 'KthvalueInferMeta', 'MaxPoolWithIndexInferMeta', 'MaxPoolV2InferMeta', + 'MinMaxWithIndexInferMeta', 'MultinomialInferMeta', 'OverlapAddInferMeta', 'PadInferMeta', diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc index 6750759633d0b8..9bf285da4d77a9 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc @@ -315,19 +315,37 @@ bool AnyOpInferSymbolicShape(pir::Operation *op, axis.size() == 0 /*reduce_all*/); } -bool ArgmaxOpInferSymbolicShape(pir::Operation *op, - pir::InferSymbolicShapeContext *infer_context) { +bool MinMaxOpInferSymbolicShape(pir::Operation *op, + pir::InferSymbolicShapeContext *infer_context, + bool output_val_and_ind = false) { bool flatten = GetBoolAttr(op, "flatten"); - bool keepdims = GetBoolAttr(op, "keepdims"); + bool keepdims = false; + int axis = 0; + + if (output_val_and_ind) { + keepdims = GetBoolAttr(op, "keepdim"); + PADDLE_ENFORCE_NE( + op->attributes().find("dim"), + op->attributes().end(), + common::errors::InvalidArgument( + "'dim' Attribute is expected for Min/MaxWithIndexOp. 
")); + axis = op->attributes() + .at("dim") + .dyn_cast() + .data() + .to(); + } else { + keepdims = GetBoolAttr(op, "keepdims"); + const auto &axis_shape_or_data = + infer_context->GetShapeOrDataForValue(op->operand_source(1)); + axis = static_cast( + axis_shape_or_data.data().value().at(0).Get()); + } const auto &input_sym_shape = infer_context->GetShapeOrDataForValue(op->operand_source(0)).shape(); - int rank = input_sym_shape.size(); - const auto &axis_shape_or_data = - infer_context->GetShapeOrDataForValue(op->operand_source(1)); - int axis = - static_cast(axis_shape_or_data.data().value().at(0).Get()); + int rank = input_sym_shape.size(); if (axis < 0) axis += rank; const auto &out_sym_shape = [&] { @@ -357,14 +375,31 @@ bool ArgmaxOpInferSymbolicShape(pir::Operation *op, symbol::TensorShapeOrDataDimExprs(out_sym_shape)}; infer_context->SetShapeOrDataForValue(op->result(0), shape_data); + if (output_val_and_ind) + infer_context->SetShapeOrDataForValue(op->result(1), shape_data); return true; } +#define DEFINE_MINMAX_OP_INFER_FUNC(OpName, output_val_and_ind) \ + bool OpName##OpInferSymbolicShape( \ + pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { \ + return MinMaxOpInferSymbolicShape(op, infer_context, output_val_and_ind); \ + } + +DEFINE_MINMAX_OP_INFER_FUNC(Argmax, false) +DEFINE_MINMAX_OP_INFER_FUNC(MaxWithIndex, true) +#undef DEFINE_MINMAX_OP_INFER_FUNC + bool ArgminOpInferSymbolicShape(pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { return ArgmaxOpInferSymbolicShape(op, infer_context); } +bool MinWithIndexOpInferSymbolicShape( + pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { + return MaxWithIndexOpInferSymbolicShape(op, infer_context); +} + bool AsComplexOpInferSymbolicShape( pir::Operation *op, pir::InferSymbolicShapeContext *infer_context) { pir::Value operand_source = op->operand_source(0); diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h index 9868d08d8a290d..8d21b51eb2719f 100755 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h @@ -93,8 +93,10 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(Lu) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Lu_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Mode) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Max) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(MaxWithIndex) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Maxout) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Min) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(MinWithIndex) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Mean) OP_DECLARE_INFER_SYMBOLIC_SHAPE(MeanAll) OP_DECLARE_INFER_SYMBOLIC_SHAPE(MatrixPower) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index d2af764fc392d7..1067c4e6854e3b 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -901,7 +901,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { true, common::errors::PreconditionNotMet( "Could not parse args and kwargs successfully, " - "please check your input first and make" + "please check your input first and make " "sure you are on the right way. 
" "The expected arguments as follow: (" "value, place, persistable, zero_copy, " @@ -1307,7 +1307,7 @@ int StringTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { true, common::errors::PreconditionNotMet( "Could not parse args and kwargs successfully, " - "please check your input first and make" + "please check your input first and make " "sure you are on the right way. " "The expected arguments as follow: (" "value, zero_copy, name, dims)")); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 15b78262ef8e0b..b5e4bb3e82a6bc 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -1353,8 +1353,8 @@ static PyObject* eager_api_set_master_grads(PyObject* self, PADDLE_ENFORCE_NE( grad, nullptr, - common::errors::Fatal("Detected nullptr grad" - "Please check if you have manually cleared" + common::errors::Fatal("Detected nullptr grad. " + "Please check if you have manually cleared " "the grad inside autograd_meta")); if (((*grad).has_allocation() || (*grad).is_dist_tensor()) && ((*grad).dtype() == phi::DataType::FLOAT16 || diff --git a/paddle/fluid/pybind/eager_generator.cc b/paddle/fluid/pybind/eager_generator.cc index 0ecd4c6263c1bf..e6b8e0ccb86bba 100644 --- a/paddle/fluid/pybind/eager_generator.cc +++ b/paddle/fluid/pybind/eager_generator.cc @@ -502,6 +502,7 @@ static void SlotNameMatching( grad_fwd_slotname_map[grad_slot_name] != fwd_slot_name) { PADDLE_THROW(common::errors::Fatal( "Detected mismatched slot names." + "Detected mismatched slot names: " "grad_slot_name %s matches both %s and %s fwd_slot_name", grad_slot_name, grad_fwd_slotname_map[grad_slot_name], @@ -536,7 +537,7 @@ static void SlotNameMatching( if (grad_fwd_slotname_map.count(grad_slot_name) && grad_fwd_slotname_map[grad_slot_name] != fwd_slot_name) { PADDLE_THROW(common::errors::Fatal( - "Detected mismatched slot names" + "Detected mismatched slot names: " "grad_slot_name %s matches both %s and %s fwd_slot_name", grad_slot_name, grad_fwd_slotname_map[grad_slot_name], diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 8af90c243833d3..2aa7606619bb4b 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -931,8 +931,8 @@ static PyObject* tensor_clear_gradient(TensorObject* self, grad = egr::EagerUtils::mutable_grad(self->tensor); PADDLE_ENFORCE( grad != nullptr, - common::errors::Fatal("Detected nullptr grad" - "Please check if you have manually cleared" + common::errors::Fatal("Detected nullptr grad. " + "Please check if you have manually cleared " "the grad inside autograd_meta")); } else { auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor); @@ -995,8 +995,8 @@ static PyObject* tensor__zero_grads(TensorObject* self, paddle::Tensor* grad = egr::EagerUtils::mutable_grad(self->tensor); PADDLE_ENFORCE( grad != nullptr, - common::errors::Fatal("Detected nullptr grad" - "Please check if you have manually cleared" + common::errors::Fatal("Detected nullptr grad. 
" + "Please check if you have manually cleared " "the grad inside autograd_meta")); if (grad->initialized()) { if (grad->is_dense_tensor() || grad->is_dist_tensor()) { diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index cd0d67efcd4439..d89c8eb8418e52 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -311,8 +311,8 @@ int tensor_properties_set_grad(TensorObject* self, paddle::Tensor* grad = egr::EagerUtils::mutable_grad(self->tensor); PADDLE_ENFORCE( grad != nullptr, - common::errors::Fatal("Detected NULL grad" - "Please check if you have manually cleared" + common::errors::Fatal("Detected NULL grad. " + "Please check if you have manually cleared " "the grad inside autograd_meta")); const phi::distributed::ProcessMesh* mesh = nullptr; if (InputsContainDistTensor(&mesh, src, self->tensor, *grad)) { @@ -334,8 +334,8 @@ int tensor_properties_set_grad_(TensorObject* self, paddle::Tensor* grad = egr::EagerUtils::mutable_grad(self->tensor); PADDLE_ENFORCE( grad != nullptr, - common::errors::Fatal("Detected NULL grad" - "Please check if you have manually cleared" + common::errors::Fatal("Detected NULL grad. " + "Please check if you have manually cleared " "the grad inside autograd_meta")); *grad = src; return 0; diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index cddb8c4e90bc16..4319540cacdaf9 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -3097,4 +3097,16 @@ void EagerSetDeviceId() { } } +paddle::optional GetInputOutTensorFromKwargs(PyObject* kwargs) { + if (!kwargs) { + return paddle::none; + } + PyObject* obj = PyDict_GetItemString(kwargs, "out"); + if (obj && PyObject_TypeCheck(obj, p_tensor_type)) { + return paddle::make_optional( + &(reinterpret_cast(obj)->tensor)); + } + return paddle::none; +} + } // namespace paddle::pybind diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index e0a1c035b353d5..95d4ac9fd2424c 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -514,5 +514,7 @@ CvtPlacements(phi::distributed::Placements placements, int ndim); void EagerSetDeviceId(); +paddle::optional GetInputOutTensorFromKwargs(PyObject* kwargs); + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index eefe6bf8e4e0ea..c7869861793036 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -737,7 +737,7 @@ void BindImperative(py::module *m_ptr) { } else { PADDLE_THROW(common::errors::InvalidArgument( "Incompatible Place Type: supports XPUPlace, CUDAPlace, " - "CPUPlace, IPUPlace, XPUPinnedPlace" + "CPUPlace, IPUPlace, XPUPinnedPlace " "and CUDAPinnedPlace, " "but got Unknown Type!")); } diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index 288a5bc95f18dd..cb73b45fa4cb0f 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -337,7 +337,7 @@ void PruneWithInput(const std::vector &input_vars, if (!input_vars_set.empty() && SomeInSet(op_results, input_vars_set)) { PADDLE_THROW(common::errors::InvalidArgument( "The input_var create by: '{%s}' is not involved in the " - "output_vars calculation" + "output_vars calculation. 
" "Please remove it from input_vars.", op->name())); } diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index 1e00d51799fad9..9872001ece2ec6 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -342,9 +342,9 @@ void BindPlace(pybind11::module &m) { // NOLINT } #else LOG(ERROR) << string::Sprintf( - "Cannot use CustomDevice because you have installed CPU/GPU" + "Cannot use CustomDevice because you have installed CPU/GPU " "version PaddlePaddle.\n" - "If you want to use CustomDevice, please try to install" + "If you want to use CustomDevice, please try to install " "CustomDevice version " "PaddlePaddle by: pip install paddlepaddle\n" "If you only have CPU, please change " diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c95f73763ca956..07bf166122bbb1 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2596,7 +2596,7 @@ All parameter, weight, gradient are variables in Paddle. VLOG(1) << string::Sprintf( "Cannot use get_all_device_type because you have installed " "CPU/GPU version PaddlePaddle.\n" - "If you want to use get_all_device_type, please try to install" + "If you want to use get_all_device_type, please try to install " "CustomDevice version " "PaddlePaddle by: pip install paddlepaddle\n"); #endif @@ -2624,7 +2624,7 @@ All parameter, weight, gradient are variables in Paddle. VLOG(1) << string::Sprintf( "Cannot use get_available_device because you have installed " "CPU/GPU version PaddlePaddle.\n" - "If you want to use get_available_device, please try to install" + "If you want to use get_available_device, please try to install " "CustomDevice version " "PaddlePaddle by: pip install paddlepaddle\n"); #endif @@ -2639,7 +2639,7 @@ All parameter, weight, gradient are variables in Paddle. "Cannot use get_available_custom_device because you have " "installed CPU/GPU version PaddlePaddle.\n" "If you want to use get_available_custom_device, please try to " - "install" + "install " "CustomDevice version " "PaddlePaddle by: pip install paddlepaddle\n"); #endif @@ -2657,7 +2657,7 @@ All parameter, weight, gradient are variables in Paddle. 
"Cannot use get_custom_device_count because you have " "installed CPU/GPU version PaddlePaddle.\n" "If you want to use get_custom_device_count, please try to " - "install" + "install " "CustomDevice version " "PaddlePaddle by: pip install paddlepaddle\n"); #endif diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index 4be2fe7a31976d..73f62793dd55f3 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -529,7 +529,7 @@ static void ParseIndex(const paddle::Tensor& tensor, PADDLE_ENFORCE_EQ(slice_tensor.shape()[i], dim_len, common::errors::OutOfRange( - "The shape of boolean index %d did not match" + "The shape of boolean index %d did not match " "indexed tensor %d along axis %d.", slice_tensor.shape()[0], dim_len, diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index fe7ce761c9cbff..102316279b0970 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -512,7 +512,7 @@ void SetTensorFromPyArrayT( } else { PADDLE_THROW(common::errors::InvalidArgument( "Incompatible place type: Tensor.set() supports " - "CPUPlace, CUDAPlace" + "CPUPlace, CUDAPlace " "and CUDAPinnedPlace, but got %s!", place)); } diff --git a/paddle/phi/api/generator/api_base.py b/paddle/phi/api/generator/api_base.py index dc96c9e3c68353..708ae750c747dd 100644 --- a/paddle/phi/api/generator/api_base.py +++ b/paddle/phi/api/generator/api_base.py @@ -238,7 +238,9 @@ def get_grad_output(self, inplace_flag): else: return f"""std::make_tuple({", ".join(args)})""" - def get_declare_args(self, inplace_flag=False): + def get_declare_args( + self, inplace_flag=False, grad_flag=False, append_input_out=False + ): declare_args = self.get_input_tensor_args(inplace_flag) for name in self.attrs['names']: default_value = '' @@ -248,13 +250,37 @@ def get_declare_args(self, inplace_flag=False): self.attrs['attr_info'][name][0] + ' ' + name + default_value ) + if ( + not grad_flag + and not inplace_flag + and append_input_out + and len(self.outputs['names']) == 1 + and self.outputs['types'][0] == "Tensor" + and self.api != "empty_like" + ): + declare_args.append( + "paddle::optional input_out = paddle::none" + ) + return ", ".join(declare_args) - def get_define_args(self, inplace_flag=False): + def get_define_args( + self, inplace_flag=False, grad_flag=False, append_input_out=True + ): define_args = self.get_input_tensor_args(inplace_flag) for name in self.attrs['names']: define_args.append(self.attrs['attr_info'][name][0] + ' ' + name) + if ( + not grad_flag + and not inplace_flag + and append_input_out + and len(self.outputs['names']) == 1 + and self.outputs['types'][0] == "Tensor" + and self.api != "empty_like" + ): + define_args.append("paddle::optional input_out") + return ", ".join(define_args) def parse_args(self, api_name, api_item_yaml): @@ -518,12 +544,12 @@ def parse_data_transform(self, api_item_yaml): def get_return_type(self, inplace_flag=False): return None - def gene_api_declaration(self): + def gene_api_declaration(self, grad_flag=False, append_input_out=True): api_declaration = "" api_func_name = self.get_api_func_name() if api_func_name[-1] != '_': api_declaration = f""" -PADDLE_API {self.get_return_type()} {api_func_name}({self.get_declare_args()}); +PADDLE_API {self.get_return_type()} {api_func_name}({self.get_declare_args(grad_flag=grad_flag, append_input_out=append_input_out)}); """ if self.is_base_api and len(self.inplace_map) > 0: @@ -532,7 +558,7 @@ def gene_api_declaration(self): 
api_declaration = ( api_declaration + f""" -PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)}); +PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True, grad_flag=grad_flag, append_input_out=append_input_out)}); """ ) @@ -1572,7 +1598,7 @@ def gene_invoke_code(self, invoke_code, params_code): return {invoke_code}; }}""" - def gene_api_code(self): + def gene_api_code(self, grad_flag=False, append_input_out=True): if self.is_base_api: api_code = self.gene_base_api_code() if len(self.inplace_map) > 0: @@ -1585,5 +1611,7 @@ def gene_api_code(self): return '' else: invoke_code = self.invoke - params_code = self.get_define_args() + params_code = self.get_define_args( + grad_flag=grad_flag, append_input_out=append_input_out + ) return self.gene_invoke_code(invoke_code, params_code) diff --git a/paddle/phi/api/generator/api_gen.py b/paddle/phi/api/generator/api_gen.py index 68b06b022381e2..363371854a7128 100644 --- a/paddle/phi/api/generator/api_gen.py +++ b/paddle/phi/api/generator/api_gen.py @@ -217,7 +217,20 @@ def gene_output( if inplace_flag and self.outputs['names'][0] in self.inplace_map else "" ) - output_create = f""" + if ( + len(self.outputs['names']) == 1 + and self.outputs['types'][0] == "Tensor" + and not ( + inplace_flag + and self.outputs['names'][0].split('@')[0] + in self.inplace_map + ) + and self.api != "empty_like" + ): + output_create = f""" +{code_indent} Tensor out_tmp; Tensor& api_output = input_out ? **input_out : out_tmp;""" + else: + output_create = f""" {code_indent} {return_type} api_output{inplace_assign};""" set_out_func = ( 'SetKernelOutput' @@ -416,14 +429,16 @@ def reset_view_after_fallback( class BackwardAPI(ForwardAPI): - def gene_base_api_code(self, inplace_flag=False): + def gene_base_api_code( + self, inplace_flag=False, grad_flag=False, append_input_out=True + ): api_func_name = self.get_api_func_name() if inplace_flag and api_func_name[-1] != '_': inplace_name = api_func_name + '_' else: inplace_name = api_func_name api_code = f""" -PADDLE_API {self.get_return_type(inplace_flag)} {inplace_name}({self.get_define_args(inplace_flag)}) {{ +PADDLE_API {self.get_return_type(inplace_flag)} {inplace_name}({self.get_define_args(inplace_flag, grad_flag=grad_flag, append_input_out=append_input_out)}) {{ {self.get_grad_outputs_define(inplace_flag)} {self.get_optional_inputs_change(inplace_flag)} {api_func_name}({self.get_grad_api_call_args(inplace_flag)}); @@ -432,7 +447,7 @@ def gene_base_api_code(self, inplace_flag=False): """ return api_code - def gene_api_code(self): + def gene_api_code(self, grad_flag=False, append_input_out=False): if not self.is_base_api and not self.is_only_composite_api: invoke_func_name = self.invoke.split('(')[0] if (not invoke_func_name.endswith("_grad")) and ( @@ -443,14 +458,17 @@ def gene_api_code(self): if self.is_only_composite_api: return "" - api_code = self.gene_base_api_code() + api_code = self.gene_base_api_code( + grad_flag=grad_flag, append_input_out=append_input_out + ) if self.is_base_api and len(self.inplace_map) > 0: if self.api[-1] == '_': api_code = "" api_code = api_code + self.gene_base_api_code_for_inplace() + return api_code - def gene_api_declaration(self): + def gene_api_declaration(self, grad_flag=False, append_input_out=True): if not self.is_base_api and not self.is_only_composite_api: invoke_func_name = self.invoke.split('(')[0] if (not invoke_func_name.endswith("_grad")) and ( @@ -465,7 +483,7 @@ 
def gene_api_declaration(self): api_func_name = self.get_api_func_name() if api_func_name[-1] != '_': api_declaration = f""" -PADDLE_API {self.get_return_type()} {api_func_name}({self.get_declare_args()}); +PADDLE_API {self.get_return_type()} {api_func_name}({self.get_declare_args(append_input_out=append_input_out)}); """ if self.is_base_api and len(self.inplace_map) > 0: @@ -474,7 +492,7 @@ def gene_api_declaration(self): api_declaration = ( api_declaration + f""" -PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)}); +PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True,append_input_out=append_input_out)}); """ ) @@ -633,12 +651,20 @@ def generate_api( if forward_api.is_dygraph_api and is_fused_ops_yaml: forward_api.is_dygraph_api = False - header_file.write(forward_api.gene_api_declaration()) - source_file.write(forward_api.gene_api_code()) + header_file.write( + forward_api.gene_api_declaration( + grad_flag=grad_flag, append_input_out=not grad_flag + ) + ) + source_file.write(forward_api.gene_api_code(grad_flag=grad_flag)) forward_api.is_dygraph_api = True - header_file.write(forward_api.gene_api_declaration()) - source_file.write(forward_api.gene_api_code()) + header_file.write( + forward_api.gene_api_declaration( + grad_flag=grad_flag, append_input_out=not grad_flag + ) + ) + source_file.write(forward_api.gene_api_code(grad_flag=grad_flag)) header_file.write(namespace[1]) source_file.write(namespace[1]) diff --git a/paddle/phi/api/generator/backward_api_gen.py b/paddle/phi/api/generator/backward_api_gen.py index 320209d7483b3d..86d491460d5cf9 100644 --- a/paddle/phi/api/generator/backward_api_gen.py +++ b/paddle/phi/api/generator/backward_api_gen.py @@ -89,15 +89,23 @@ def check_args(self, forward_config): ), f"{self.api} : Output error: The number of outputs should be less then the number of inputs of forward api. \ Please check the output of {self.api} in yaml." 
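
Taken together, the generator changes in api_base.py and api_gen.py append a trailing `input_out` parameter to forward APIs with exactly one `Tensor` output (skipping inplace, grad, and `empty_like` variants) and route it into output creation; `GetInputOutTensorFromKwargs` above supplies it from a Python-side `out=` keyword. A sketch of the emitted C++ for a hypothetical op `foo`; the `paddle::optional<Tensor*>` template argument is inferred from the `**input_out` dereference, since angle brackets did not survive extraction in this diff:

#include "paddle/utils/optional.h"

// Hypothetical generated declaration (get_declare_args with append_input_out):
PADDLE_API Tensor foo(const Tensor& x,
                      paddle::optional<Tensor*> input_out = paddle::none);

// Hypothetical generated body (the gene_output branch added above):
PADDLE_API Tensor foo(const Tensor& x, paddle::optional<Tensor*> input_out) {
  Tensor out_tmp;
  // Write into the caller-provided tensor when present; otherwise the local
  // temporary becomes the returned value.
  Tensor& api_output = input_out ? **input_out : out_tmp;
  // ... kernel selection and dispatch fill api_output ...
  return api_output;
}
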
- def get_declare_args(self, inplace_flag=False): - return self.get_define_args() + def get_declare_args( + self, inplace_flag=False, grad_flag=False, append_input_out=False + ): + return self.get_define_args( + grad_flag=grad_flag, append_input_out=append_input_out + ) - def get_define_args(self, inplace_flag=False): + def get_define_args( + self, inplace_flag=False, grad_flag=False, append_input_out=False + ): out_type_map = { 'Tensor': 'Tensor*', 'std::vector': 'std::vector', } - inputs_and_attrs = super().get_define_args() + inputs_and_attrs = super().get_define_args( + grad_flag=grad_flag, append_input_out=False + ) outs = [] for i, name in enumerate(self.outputs['names']): outs.append( @@ -111,7 +119,7 @@ def get_define_args(self, inplace_flag=False): def gene_return_code(self): return "" - def gene_api_declaration(self): + def gene_api_declaration(self, grad_flag=False, append_input_out=False): if not self.is_base_api and not self.is_only_composite_api: invoke_func_name = self.invoke.split('(')[0] if (not invoke_func_name.endswith("_grad")) and ( diff --git a/paddle/phi/api/generator/dist_api_gen.py b/paddle/phi/api/generator/dist_api_gen.py index 72d2afccfafb42..ed47941a61570d 100644 --- a/paddle/phi/api/generator/dist_api_gen.py +++ b/paddle/phi/api/generator/dist_api_gen.py @@ -1140,9 +1140,16 @@ def generate_output_creation_code(self) -> str: return_type, inplace_assign_code ) else: - output_creation_code += API_OUT_CREATION_TEMPLATE.format( - return_type, "" - ) + if ( + len(self.outputs['names']) == 1 + and self.outputs['types'][0] == "Tensor" + and self.api != "empty_like" + ): + output_creation_code += "Tensor out_tmp; Tensor& api_output = input_out ? **input_out : out_tmp;" + else: + output_creation_code += API_OUT_CREATION_TEMPLATE.format( + return_type, "" + ) # kernel output generate self.dist_output_args.append('dist_out') self.dense_output_args.append('dense_out') @@ -2092,7 +2099,9 @@ def check_argument_whether_support_auto_parallel(self): return True # override BaseAPI's method - def gene_base_api_code(self, inplace_flag=False): + def gene_base_api_code( + self, inplace_flag=False, grad_flag=False, append_input_out=True + ): # init status self.inplace_flag = inplace_flag self.dist_output_args = [] @@ -2159,14 +2168,25 @@ def gene_base_api_code(self, inplace_flag=False): class DistBackwardAPI(DistForwardAPI): - def gene_base_api_code(self, inplace_flag=False): - return BackwardAPI.gene_base_api_code(self, inplace_flag) + def gene_base_api_code( + self, inplace_flag=False, grad_flag=False, append_input_out=True + ): + return BackwardAPI.gene_base_api_code( + self, + inplace_flag, + grad_flag=grad_flag, + append_input_out=append_input_out, + ) - def gene_api_code(self): - return BackwardAPI.gene_api_code(self) + def gene_api_code(self, grad_flag=False, append_input_out=False): + return BackwardAPI.gene_api_code( + self, grad_flag=grad_flag, append_input_out=append_input_out + ) - def gene_api_declaration(self): - return BackwardAPI.gene_api_declaration(self) + def gene_api_declaration(self, grad_flag=False, append_input_out=True): + return BackwardAPI.gene_api_declaration( + self, grad_flag=grad_flag, append_input_out=append_input_out + ) def generate_api( @@ -2233,12 +2253,22 @@ def generate_api( if dist_forward_api.is_dygraph_api and is_fused_ops_yaml: dist_forward_api.is_dygraph_api = False - header_file.write(dist_forward_api.gene_api_declaration()) - source_file.write(dist_forward_api.gene_api_code()) + header_file.write( + dist_forward_api.gene_api_declaration( + 
grad_flag=grad_flag, append_input_out=not grad_flag + ) + ) + source_file.write( + dist_forward_api.gene_api_code(grad_flag=grad_flag) + ) dist_forward_api.is_dygraph_api = True - header_file.write(dist_forward_api.gene_api_declaration()) - source_file.write(dist_forward_api.gene_api_code()) + header_file.write( + dist_forward_api.gene_api_declaration( + grad_flag=grad_flag, append_input_out=not grad_flag + ) + ) + source_file.write(dist_forward_api.gene_api_code(grad_flag=grad_flag)) header_file.write(namespace[1]) source_file.write(namespace[1]) diff --git a/paddle/phi/api/generator/dist_bw_api_gen.py b/paddle/phi/api/generator/dist_bw_api_gen.py index 2d7abedfb02061..b85e40b59fa80d 100644 --- a/paddle/phi/api/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/generator/dist_bw_api_gen.py @@ -417,8 +417,12 @@ def gene_return_code(self): return "" # override BaseAPI's method - def gene_api_declaration(self) -> str: - return BackwardAPI.gene_api_declaration(self) + def gene_api_declaration( + self, grad_flag=False, append_input_out=False + ) -> str: + return BackwardAPI.gene_api_declaration( + self, grad_flag=grad_flag, append_input_out=not grad_flag + ) def generate_reshard_output_code(self): reshard_output_code = "" diff --git a/paddle/phi/api/generator/sparse_api_gen.py b/paddle/phi/api/generator/sparse_api_gen.py index 97a8c9994f92fb..019900a9999660 100644 --- a/paddle/phi/api/generator/sparse_api_gen.py +++ b/paddle/phi/api/generator/sparse_api_gen.py @@ -23,10 +23,10 @@ class SparseAPI(ForwardAPI): def __init__(self, api_item_yaml): super().__init__(api_item_yaml) - def gene_api_declaration(self): + def gene_api_declaration(self, grad_flag=False, append_input_out=False): return f""" // {", ".join(self.outputs['names'])} -{super().gene_api_declaration()} +{super().gene_api_declaration(append_input_out=False)} """ def gene_output( @@ -392,7 +392,9 @@ def gene_dispatch_code(self, kernel_name, inplace_flag=False): }} """ - def gene_base_api_code(self, inplace_flag=False): + def gene_base_api_code( + self, inplace_flag=False, grad_flag=False, append_input_out=False + ): api_func_name = self.get_api_func_name() if inplace_flag and api_func_name[-1] != '_': api_func_name += '_' @@ -403,7 +405,7 @@ def gene_base_api_code(self, inplace_flag=False): ) return f""" -PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag)}) {{ +PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag, grad_flag=grad_flag, append_input_out=False)}) {{ {kernel_dispatch_code} PADDLE_THROW(common::errors::Unimplemented( "The kernel of ({self.api}) for input tensors is unimplemented, please check the type of input tensors.")); @@ -468,7 +470,9 @@ def api_namespace(): ) -def generate_api(api_yaml_path, header_file_path, source_file_path): +def generate_api( + api_yaml_path, header_file_path, source_file_path, grad_flag=False +): apis = [] for each_api_yaml in api_yaml_path: @@ -496,8 +500,16 @@ def generate_api(api_yaml_path, header_file_path, source_file_path): continue if sparse_api.is_dygraph_api: sparse_api.is_dygraph_api = False - header_file.write(sparse_api.gene_api_declaration()) - source_file.write(sparse_api.gene_api_code()) + header_file.write( + sparse_api.gene_api_declaration( + grad_flag=grad_flag, append_input_out=False + ) + ) + source_file.write( + sparse_api.gene_api_code( + grad_flag=grad_flag, append_input_out=False + ) + ) header_file.write(namespace[1]) source_file.write(namespace[1]) @@ -556,11 +568,14 @@ def 
main(): backward_api_yaml_path = options.backward_api_yaml_path backward_header_file_path = options.backward_api_header_path backward_source_file_path = options.backward_api_source_path - generate_api(api_yaml_path, header_file_path, source_file_path) + generate_api( + api_yaml_path, header_file_path, source_file_path, grad_flag=False + ) generate_api( backward_api_yaml_path, backward_header_file_path, backward_source_file_path, + grad_flag=True, ) diff --git a/paddle/phi/api/generator/sparse_bw_api_gen.py b/paddle/phi/api/generator/sparse_bw_api_gen.py index c5ebdd51f2e1e8..059504de8def02 100644 --- a/paddle/phi/api/generator/sparse_bw_api_gen.py +++ b/paddle/phi/api/generator/sparse_bw_api_gen.py @@ -35,14 +35,24 @@ def get_return_type(self, inplace_flag=False): def gene_return_code(self): return "return;" - def gene_api_declaration(self): - return SparseAPI.gene_api_declaration(self) + def gene_api_declaration(self, grad_flag=False, append_input_out=False): + return SparseAPI.gene_api_declaration( + self, grad_flag=grad_flag, append_input_out=False + ) - def get_declare_args(self, inplace_flag=False): - return BackwardAPI.get_declare_args(self) + def get_declare_args( + self, inplace_flag=False, grad_flag=False, append_input_out=False + ): + return BackwardAPI.get_declare_args( + self, grad_flag=grad_flag, append_input_out=False + ) - def get_define_args(self, inplace_flag=False): - return BackwardAPI.get_define_args(self) + def get_define_args( + self, inplace_flag=False, grad_flag=False, append_input_out=False + ): + return BackwardAPI.get_define_args( + self, grad_flag=grad_flag, append_input_out=False + ) def gene_output( self, @@ -157,7 +167,9 @@ def api_namespace(): ) -def generate_api(api_yaml_path, header_file_path, source_file_path): +def generate_api( + api_yaml_path, header_file_path, source_file_path, grad_flag=False +): with open(api_yaml_path, 'r') as f: apis = yaml.load(f, Loader=yaml.FullLoader) header_file = open(header_file_path, 'w') @@ -175,8 +187,16 @@ def generate_api(api_yaml_path, header_file_path, source_file_path): for api in apis: sparse_bw_api = SparseBackwardAPI(api) - header_file.write(sparse_bw_api.gene_api_declaration()) - source_file.write(sparse_bw_api.gene_api_code()) + header_file.write( + sparse_bw_api.gene_api_declaration( + grad_flag=grad_flag, append_input_out=False + ) + ) + source_file.write( + sparse_bw_api.gene_api_code( + grad_flag=grad_flag, append_input_out=False + ) + ) header_file.write(namespace[1]) source_file.write(namespace[1]) @@ -213,7 +233,9 @@ def main(): header_file_path = options.api_header_path source_file_path = options.api_source_path - generate_api(api_yaml_path, header_file_path, source_file_path) + generate_api( + api_yaml_path, header_file_path, source_file_path, grad_flag=True + ) if __name__ == '__main__': diff --git a/paddle/phi/api/generator/strings_api_gen.py b/paddle/phi/api/generator/strings_api_gen.py index c22b5a6e87b030..03097c50e5a550 100644 --- a/paddle/phi/api/generator/strings_api_gen.py +++ b/paddle/phi/api/generator/strings_api_gen.py @@ -31,7 +31,7 @@ def get_api_func_name(self): def gene_api_declaration(self): return f""" // {", ".join(self.outputs['names'])} -{super().gene_api_declaration()} +{super().gene_api_declaration(append_input_out=False)} """ def get_kernel_tensor_out_type(self, output_name): @@ -306,10 +306,12 @@ def gene_kernel_select(self) -> str: return kernel_select_code - def gene_base_api_code(self, inplace_flag=False): + def gene_base_api_code( + self, inplace_flag=False, 
grad_flag=False, append_input_out=False + ): api_func_name = self.get_api_func_name() return f""" -PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag)}) {{ +PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag, grad_flag=grad_flag, append_input_out=False)}) {{ {self.gene_kernel_select()} {self.gen_string_tensor_kernel_code(inplace_flag)} }} diff --git a/paddle/phi/api/generator/tensor_operants_gen.py b/paddle/phi/api/generator/tensor_operants_gen.py index ea1184bf0581a6..4b15b84d6f5768 100644 --- a/paddle/phi/api/generator/tensor_operants_gen.py +++ b/paddle/phi/api/generator/tensor_operants_gen.py @@ -479,11 +479,11 @@ def gene_operants_base(self): api_func_name = self.get_api_func_name() if api_func_name[-1] != '_': return f""" -{indent}virtual {self.get_return_type()} {api_func_name}({self.get_declare_args()}) = 0; +{indent}virtual {self.get_return_type()} {api_func_name}({self.get_declare_args(append_input_out=False)}) = 0; """ else: return f""" -{indent}virtual {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)}) = 0; +{indent}virtual {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True, append_input_out=False)}) = 0; """ def get_declare_args_without_first_tensor(self, inplace_flag=False): @@ -553,11 +553,11 @@ def gene_operants_declaration(self): api_func_name = self.get_api_func_name() if api_func_name[-1] != '_': return f""" -{indent}{self.get_return_type()} {api_func_name}({self.get_declare_args()}); +{indent}{self.get_return_type()} {api_func_name}({self.get_declare_args(append_input_out=False)}); """ else: return f""" -{indent}{self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)}); +{indent}{self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True, append_input_out=False)}); """ def gene_operants_implementation(self): @@ -567,13 +567,13 @@ def gene_operants_implementation(self): # func declaration if func_name[-1] != '_': return f""" -{self.get_return_type()} PhiTensorOperants::{func_name}({self.get_define_args()}) {{ +{self.get_return_type()} PhiTensorOperants::{func_name}({self.get_define_args(append_input_out=False)}) {{ {indent}return paddle::experimental::{func_name}({func_args_code}); }} """ else: return f""" -{self.get_return_type(inplace_flag=True)} PhiTensorOperants::{func_name}({self.get_define_args(inplace_flag=True)}) {{ +{self.get_return_type(inplace_flag=True)} PhiTensorOperants::{func_name}({self.get_define_args(inplace_flag=True,append_input_out=False)}) {{ {indent}return paddle::experimental::{func_name}({func_args_code}); }} @@ -640,14 +640,14 @@ def gene_operants_manager_implementation(self): return ( final_code + f""" -{self.get_return_type()} OperantsManager::{func_name}({self.get_define_args()}) {{{self.gene_operants_manager_code()}}} +{self.get_return_type()} OperantsManager::{func_name}({self.get_define_args(append_input_out=False)}) {{{self.gene_operants_manager_code()}}} """ ) else: return ( final_code + f""" -{self.get_return_type(inplace_flag=True)} OperantsManager::{func_name}({self.get_define_args(inplace_flag=True)}) {{ +{self.get_return_type(inplace_flag=True)} OperantsManager::{func_name}({self.get_define_args(inplace_flag=True,append_input_out=False)}) {{ {self.gene_operants_manager_code()} }} """ diff --git a/paddle/phi/api/lib/tensor_method.cc 
b/paddle/phi/api/lib/tensor_method.cc index 2f7d54eaa05e00..5ad401cbddb7b8 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -34,13 +34,20 @@ limitations under the License. */ #include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h" #include "paddle/phi/api/lib/data_transform.h" #endif +#include "paddle/utils/optional.h" COMMON_DECLARE_bool(use_stride_kernel); namespace paddle { namespace experimental { // declare cast api -Tensor cast(const Tensor &x, DataType out_dtype); -Tensor copy_to(const Tensor &x, const Place &place, bool blocking); +Tensor cast(const Tensor &x, +DataType out_dtype, +paddle::optional input_out = paddle::none); + +Tensor copy_to(const Tensor &x, +const Place &place, +bool blocking, + paddle::optional input_out = paddle::none); } // namespace experimental // TODO(chenweihang): Remove this namespace using-directives later diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc index 7b37de7fce0d0c..8b3ff923b1d444 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -40,8 +40,8 @@ PHI_DEFINE_EXPORTED_string( "This option is useful when doing multi process training and " "each process have only one device (XPU). If you want to use " "all visible devices, set this to empty string. NOTE: the " - "reason of doing this is that we want to use P2P communication" - "between XPU devices, use XPU_VISIBLE_DEVICES can only use" + "reason of doing this is that we want to use P2P communication " + "between XPU devices, use XPU_VISIBLE_DEVICES can only use " "share-memory only."); namespace phi { diff --git a/paddle/phi/core/platform/cpu_helper.cc b/paddle/phi/core/platform/cpu_helper.cc index 751c0a3bd0f934..269d8fd8b6d2b1 100644 --- a/paddle/phi/core/platform/cpu_helper.cc +++ b/paddle/phi/core/platform/cpu_helper.cc @@ -51,7 +51,7 @@ void SetNumThreads(int num_threads) { return; #else PADDLE_THROW(common::errors::Unimplemented( - "This library (except OPENBLAS, MKLML) is not supported yet, so the" + "This library (except OPENBLAS, MKLML) is not supported yet, so the " "number of threads cannot be set.")); #endif } diff --git a/paddle/phi/core/platform/profiler.cc b/paddle/phi/core/platform/profiler.cc index 993db57c6d90eb..a03f55a3dcf9e6 100644 --- a/paddle/phi/core/platform/profiler.cc +++ b/paddle/phi/core/platform/profiler.cc @@ -625,7 +625,7 @@ void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, common::errors::InvalidArgument( - "Can't enable profiling, since the input state is" + "Can't enable profiling, since the input state is " "ProfilerState::kDisabled")); SynchronizeAllDevice(); std::lock_guard l(profiler_mu); diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index e7b6980f3b70bf..3a7e6eb108f1b9 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -2421,16 +2421,17 @@ void FusedMultiTransformerInt8InferMeta( } void FusedTransposeSplitQuantInferMeta(const MetaTensor& x, + const MetaTensor& input_scales, const IntArray& tokens_per_expert, bool pow_2_scales, std::vector outs, std::vector scales) { PADDLE_ENFORCE_EQ( - x.dtype(), - DataType::BFLOAT16, - common::errors::InvalidArgument( - "The dtype of Input(x) must be BFLOAT16, but received %s", - x.dtype())); + x.dtype() == DataType::BFLOAT16 || x.dtype() == DataType::FLOAT8_E4M3FN, + true, + common::errors::InvalidArgument("The dtype of Input(x) must be BFLOAT16 " + "or FLOAT8_E4M3FN, but 
received %s", + x.dtype())); auto x_dims = x.dims(); diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h index a3e6342b09f0a1..c1f6a988bf59b1 100644 --- a/paddle/phi/infermeta/fusion.h +++ b/paddle/phi/infermeta/fusion.h @@ -669,6 +669,7 @@ void FusedMultiTransformerInt8InferMeta( MetaTensor* out); void FusedTransposeSplitQuantInferMeta(const MetaTensor& x, + const MetaTensor& input_scales, const IntArray& tokens_per_expert, bool pow_2_scales, std::vector outs, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index a30e9fd2f035e4..933edfaa0ea1a3 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -366,6 +366,90 @@ void ArgMinMaxInferMeta(const MetaTensor& x, } } +void MinMaxWithIndexInferMeta(const MetaTensor& x, + const Scalar& axis, + bool keepdims, + bool flatten, + MetaTensor* val_out, + MetaTensor* ind_out, + MetaConfig config) { + DataType val_dtype = x.dtype(); + + if (!config.is_runtime && axis.FromTensor()) { + std::vector vec; + if (flatten) { + if (keepdims) { // NOLINT + vec = std::vector(x.dims().size(), -1); + } else { + vec = {}; + } + } else { + if (keepdims) { + vec = std::vector(x.dims().size(), -1); + } else { + vec = std::vector(x.dims().size() - 1, -1); + } + } + val_out->set_dims(common::make_ddim(vec)); + val_out->set_dtype(val_dtype); + ind_out->set_dims(common::make_ddim(vec)); + ind_out->set_dtype(DataType::INT64); + return; + } + auto int_axis = axis.to(); + const auto& x_dims = x.dims(); + + auto x_rank = x.dims().size(); + if (x_rank > 0) { + PADDLE_ENFORCE_GE(int_axis, + -x_rank, + common::errors::InvalidArgument( + "'axis'(%d) must be greater than or equal to" + " -Rank(X)(%d).", + int_axis, + -x_rank)); + PADDLE_ENFORCE_LT( + int_axis, + x_rank, + common::errors::InvalidArgument( + "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", + int_axis, + x_rank)); + } else { + // 0-dim tensor + PADDLE_ENFORCE_EQ(int_axis == 0 || int_axis == -1, + true, + common::errors::InvalidArgument( + "'axis'(%d) must be 0 or -1 if input tensor is " + "0-dim.", + int_axis)); + } + + if (int_axis < 0) int_axis += x_rank; + + std::vector vec; + if (flatten) { + if (keepdims) { // NOLINT + vec = std::vector(x.dims().size(), 1); + } else { + vec = {}; + } + } else { + for (int64_t i = 0; i < int_axis; i++) + vec.emplace_back(x_dims[static_cast(i)]); + if (keepdims) { + vec.emplace_back(static_cast(1)); + } + for (int64_t i = int_axis + 1; i < x_rank; i++) + vec.emplace_back(x_dims[static_cast(i)]); + } + + val_out->set_dims(common::make_ddim(vec)); + val_out->set_dtype(val_dtype); + ind_out->set_dims(common::make_ddim(vec)); + ind_out->set_dtype(DataType::INT64); +} + void ArgsortInferMeta(const MetaTensor& input, int axis, bool descending, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 7334ee476c0ad9..ea6c95748c16c5 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -66,6 +66,14 @@ void ArgMinMaxInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void MinMaxWithIndexInferMeta(const MetaTensor& x, + const Scalar& axis, + bool keepdims, + bool flatten, + MetaTensor* val_out, + MetaTensor* ind_out, + MetaConfig config = MetaConfig()); + void ArgsortInferMeta(const MetaTensor& input, int axis, bool descending, diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h index 80d7028a3082f9..a4a858ff8eaf8b 100644 --- a/paddle/phi/kernels/autotune/auto_tune_base.h +++ 
b/paddle/phi/kernels/autotune/auto_tune_base.h @@ -63,7 +63,7 @@ class AutoTuneBase { } template - void Run(const Context& ctx, + void Run(const Context& dev_ctx, const AlgorithmType& algo, const size_t key, Args&&... args) { @@ -78,7 +78,7 @@ class AutoTuneBase { if (use_autotune) { // All available kernels have ran while picking the best kernel, // so there may be no need for another kernel run. - auto best_idx = PickBestKernel(ctx, args...); + auto best_idx = PickBestKernel(dev_ctx, args...); cache.Set(key, best_idx); } else { kernels_[0].Run(args...); @@ -100,14 +100,14 @@ class AutoTuneBase { } template - size_t PickBestKernel(const Context& ctx, Args&&... args) { + size_t PickBestKernel(const Context& dev_ctx, Args&&... args) { std::lock_guard lock(mutex_); size_t best_idx = 0; float min_time = std::numeric_limits::max(); // Time cost test established in default stream. for (size_t i = 0; i < kernels_.size(); ++i) { - auto time = RunAndMeasureKernel(ctx, i, args...); + auto time = RunAndMeasureKernel(dev_ctx, i, args...); if (time < min_time) { min_time = time; best_idx = i; @@ -118,15 +118,17 @@ class AutoTuneBase { } template - float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) { + float RunAndMeasureKernel(const Context& dev_ctx, + const int idx, + Args&&... args) { // Regard 1st run as warmup, judge the compare result by the time cost // of rest cycles. constexpr int repeats = 11; phi::GpuTimer timer; float time_cost = 0; - const auto& stream = ctx.stream(); + const auto& stream = dev_ctx.stream(); - ctx.Wait(); + dev_ctx.Wait(); for (int i = 0; i < repeats; ++i) { timer.Start(stream); kernels_[idx].Run(args...); @@ -158,7 +160,7 @@ class MatmulAutoTuner } template - void Run(const Context& ctx, const size_t key, Args... args) { + void Run(const Context& dev_ctx, const size_t key, Args... args) { this->is_init_ = true; this->CheckKernelSize(); auto& cache = AutoTuneCache::Instance().GetMatmul(); @@ -168,7 +170,7 @@ class MatmulAutoTuner } else { bool use_autotune = AutoTuneStatus::Instance().UseAutoTune(); if (use_autotune) { - auto best_idx = this->PickBestKernel(ctx, args...); + auto best_idx = this->PickBestKernel(dev_ctx, args...); cache.Set(key, best_idx); } else { this->kernels_[0].Run(args...); @@ -210,7 +212,7 @@ class GatherGemmScatterAutoTuner return instance.get(); } - void Run(const phi::GPUContext& ctx, + void Run(const phi::GPUContext& dev_ctx, const size_t key, T const alpha, T const beta, @@ -227,15 +229,15 @@ class GatherGemmScatterAutoTuner } else { // Set alpha to 0 and beta to 1 to avoid changing the value of d when // picking the best kernel - auto best_idx = - PickBestKernel(ctx, static_cast(0), static_cast(1), args...); + auto best_idx = PickBestKernel( + dev_ctx, static_cast(0), static_cast(1), args...); cache.Set(key, best_idx); this->kernels_[best_idx].Run(alpha, beta, args...); } } protected: - size_t PickBestKernel(const phi::GPUContext& ctx, + size_t PickBestKernel(const phi::GPUContext& dev_ctx, const T& alpha, const T& beta, Args&... args) { @@ -250,7 +252,7 @@ class GatherGemmScatterAutoTuner // Some kernels may require more shared memory than available, skip these // kernels. 
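
The auto_tune_base.h hunks are a mechanical `ctx` -> `dev_ctx` rename, but the timing protocol they pass through is worth making explicit: `RunAndMeasureKernel` runs each candidate 11 times, discards the first run as warmup, and `PickBestKernel` keeps the index with the smallest accumulated time. A standalone sketch under that reading; the `Stop`/`ElapsedTime` calls on `phi::GpuTimer` and the gpu_timer.h include path are assumptions from context, as the excerpt only shows `Start`:

#include <functional>

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/autotune/gpu_timer.h"

// Sketch of the warmup-then-accumulate measurement loop, for illustration.
float RunAndMeasure(const phi::GPUContext& dev_ctx,
                    const std::function<void()>& kernel) {
  constexpr int repeats = 11;
  phi::GpuTimer timer;
  float time_cost = 0.0f;
  dev_ctx.Wait();  // drain pending work so timing starts on a quiet stream
  for (int i = 0; i < repeats; ++i) {
    timer.Start(dev_ctx.stream());
    kernel();
    timer.Stop(dev_ctx.stream());
    if (i > 0) time_cost += timer.ElapsedTime();  // first run is warmup
  }
  return time_cost;
}
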
try { - time = this->RunAndMeasureKernel(ctx, i, alpha, beta, args...); + time = this->RunAndMeasureKernel(dev_ctx, i, alpha, beta, args...); if (time < min_time) { min_time = time; best_idx = i; diff --git a/paddle/phi/kernels/cpu/min_max_with_index_kernel.cc b/paddle/phi/kernels/cpu/min_max_with_index_kernel.cc new file mode 100644 index 00000000000000..f373553389e422 --- /dev/null +++ b/paddle/phi/kernels/cpu/min_max_with_index_kernel.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/min_max_with_index_kernel.h" + +#include "paddle/common/ddim.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#define DEFINE_WITH_INDEX_KERNEL(OpType, name) \ + template \ + void OpType##WithIndexKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const Scalar& dim, \ + bool keepdim, \ + bool flatten, \ + DenseTensor* val_out, \ + DenseTensor* ind_out) { \ + PADDLE_ENFORCE_EQ(0, \ + 1, \ + phi::errors::Unimplemented( \ + "In static graph mode, %s PHI kernel is not " \ + "currently available on non-GPU devices.", \ + #name)); \ + } \ + template \ + void OpType##WithIndexGradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& values, \ + const DenseTensor& indices, \ + const DenseTensor& values_grad, \ + const Scalar& dim, \ + bool keepdim, \ + DenseTensor* x_grad) { \ + PADDLE_ENFORCE_EQ(0, \ + 1, \ + phi::errors::Unimplemented( \ + "In static graph mode, %s PHI kernel is not " \ + "currently available on non-GPU devices.", \ + #name)); \ + } + +namespace phi { + +DEFINE_WITH_INDEX_KERNEL(Min, min_with_index) +DEFINE_WITH_INDEX_KERNEL(Max, max_with_index) +#undef DEFINE_WITH_INDEX_KERNEL + +} // namespace phi + +#define REGISTER_CPU_KERNELS(OpType, OpName) \ + PD_REGISTER_KERNEL(OpName, \ + CPU, \ + ALL_LAYOUT, \ + phi::OpType##WithIndexKernel, \ + phi::dtype::float16, \ + phi::dtype::bfloat16, \ + float, \ + double, \ + int32_t, \ + int64_t, \ + int16_t, \ + uint8_t) { \ + kernel->OutputAt(0).SetDataType(kernel->InputAt(0).dtype); \ + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); \ + } \ + PD_REGISTER_KERNEL(OpName##_grad, \ + CPU, \ + ALL_LAYOUT, \ + phi::OpType##WithIndexGradKernel, \ + float, \ + double, \ + uint8_t, \ + int, \ + int16_t, \ + int64_t, \ + phi::dtype::float16, \ + phi::dtype::bfloat16) {} + +REGISTER_CPU_KERNELS(Min, min_with_index) +REGISTER_CPU_KERNELS(Max, max_with_index) +#undef REGISTER_CPU_KERNELS diff --git a/paddle/phi/kernels/funcs/cross_entropy.cc b/paddle/phi/kernels/funcs/cross_entropy.cc index 6616f07e68a10c..9fb68c155402f5 100644 --- a/paddle/phi/kernels/funcs/cross_entropy.cc +++ b/paddle/phi/kernels/funcs/cross_entropy.cc @@ -93,7 +93,7 @@ struct HardLabelCrossEntropyCPUFunctorImpl { template void 
CrossEntropyFunctor::operator()( - const DeviceContext& ctx, + const DeviceContext& dev_ctx, phi::DenseTensor* out, const phi::DenseTensor* prob, const phi::DenseTensor* labels, @@ -110,7 +110,7 @@ void CrossEntropyFunctor::operator()( auto lbl = EigenMatrix::From(*labels); auto loss = EigenMatrix::From(*out); - loss.device(*ctx.eigen_device()) = + loss.device(*dev_ctx.eigen_device()) = -((lbl * in.log().unaryExpr(phi::funcs::TolerableValue())) .reshape(batch_axis_remain) .sum(Eigen::DSizes(1))); diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index f351e74260c022..b532b1a90163ca 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -1378,7 +1378,7 @@ struct CopySignGradXYFunctor { if (x == static_cast(0)) outs[0] = static_cast(0); else - outs[0] = static_cast(dout * (funcs::copysign_func(x, y)) / x); + outs[0] = static_cast(dout * (funcs::copysign_func(x, y) / x)); // dy = 0 outs[1] = static_cast(0); return outs; diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.cc b/paddle/phi/kernels/funcs/gather_scatter_functor.cc index 95c9f69a2abfd8..f7274faebd6f08 100644 --- a/paddle/phi/kernels/funcs/gather_scatter_functor.cc +++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cc @@ -77,7 +77,7 @@ struct cpu_gather_scatter_functor { const std::string& method_name, const func_t& reduce_op, bool include_self, - const phi::DeviceContext& ctx UNUSED) { + const phi::DeviceContext& dev_ctx UNUSED) { if (index.numel() == 0) { return; } @@ -237,7 +237,7 @@ void cpu_gather_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor result, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()(result, @@ -247,7 +247,7 @@ void cpu_gather_kernel(phi::DenseTensor self, "gather_out_cpu", tensor_assign, include_self, - ctx); + dev_ctx); } template @@ -256,7 +256,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor src, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()(self, @@ -266,7 +266,7 @@ void cpu_scatter_assign_kernel(phi::DenseTensor self, "scatter_assign_cpu", tensor_assign, include_self, - ctx); + dev_ctx); } template @@ -275,11 +275,17 @@ void cpu_scatter_add_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor src, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_add_cpu", reduce_add, include_self, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_add_cpu", + reduce_add, + include_self, + dev_ctx); } template @@ -288,11 +294,17 @@ void cpu_scatter_mul_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor src, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_mul_cpu", reduce_mul, include_self, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_mul_cpu", + reduce_mul, + include_self, + dev_ctx); } template @@ -301,11 +313,17 @@ void cpu_scatter_mean_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor src, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()( - 
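
The CopySignGradXYFunctor change above only regroups parentheses, but the regrouping is numerically meaningful: `copysign_func(x, y) / x` divides two values of equal magnitude, so it evaluates to exactly plus or minus one, and `dout * (+-1)` can never overflow, whereas `(dout * copysign_func(x, y)) / x` forms an intermediate product that can overflow in narrow dtypes. The same effect reproduced in plain float:

#include <cmath>
#include <cstdio>

int main() {
  const float dout = 3.0e38f, x = 8.0f, y = -3.0f;
  // Intermediate dout * copysign(x, y) = -2.4e39 overflows float to -inf:
  const float overflowing = (dout * std::copysign(x, y)) / x;
  // copysign(x, y) / x is exactly -1.0f, so this stays finite:
  const float safe = dout * (std::copysign(x, y) / x);
  std::printf("%g vs %g\n", overflowing, safe);  // -inf vs -3e+38
  return 0;
}

The x == 0 case is still handled separately in the functor (the gradient is forced to zero before this expression runs).
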
self, dim, index, src, "scatter_mean_cpu", reduce_add, include_self, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_mean_cpu", + reduce_add, + include_self, + dev_ctx); } template @@ -314,11 +332,17 @@ void cpu_scatter_max_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor src, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_max_cpu", reduce_max, include_self, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_max_cpu", + reduce_max, + include_self, + dev_ctx); } template @@ -327,11 +351,17 @@ void cpu_scatter_min_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor src, bool include_self, - const phi::DeviceContext& ctx) { + const phi::DeviceContext& dev_ctx) { cpu_gather_scatter_functor()( - self, dim, index, src, "scatter_min_cpu", reduce_min, include_self, ctx); + /*is_scatter_like=*/true>()(self, + dim, + index, + src, + "scatter_min_cpu", + reduce_min, + include_self, + dev_ctx); } template @@ -340,7 +370,7 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED, const phi::DenseTensor& index, phi::DenseTensor grad, bool include_self UNUSED, - const phi::DeviceContext& ctx UNUSED) { + const phi::DeviceContext& dev_ctx UNUSED) { auto* index_data = index.data(); auto* grad_data = grad.data(); @@ -376,16 +406,17 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED, } template -void cpu_scatter_mul_min_max_input_grad_kernel(phi::DenseTensor self UNUSED, - int dim, - const phi::DenseTensor& index, - const phi::DenseTensor& out, - const phi::DenseTensor& x, - const phi::DenseTensor& value, - phi::DenseTensor grad, - const std::string& reduce, - bool include_self UNUSED, - const phi::DeviceContext& ctx) { +void cpu_scatter_mul_min_max_input_grad_kernel( + phi::DenseTensor self UNUSED, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self UNUSED, + const phi::DeviceContext& dev_ctx) { auto* index_data = index.data(); auto* grad_data = grad.data(); auto* out_data = out.data(); @@ -457,7 +488,8 @@ void cpu_scatter_mean_input_grad_kernel(phi::DenseTensor self UNUSED, const phi::DenseTensor& index, phi::DenseTensor grad, bool include_self UNUSED, - const phi::DeviceContext& ctx UNUSED) { + const phi::DeviceContext& dev_ctx + UNUSED) { auto* index_data = index.data(); auto* grad_data = grad.data(); @@ -504,7 +536,7 @@ void cpu_scatter_value_grad_kernel(phi::DenseTensor self, const phi::DenseTensor& index, phi::DenseTensor grad, bool include_self UNUSED, - const phi::DeviceContext& ctx UNUSED) { + const phi::DeviceContext& dev_ctx UNUSED) { auto* self_data = self.data(); auto* index_data = index.data(); auto* grad_data = grad.data(); @@ -564,7 +596,7 @@ void cpu_scatter_add_mean_value_grad_kernel( phi::DenseTensor grad, const std::string& reduce, bool include_self, - const phi::DeviceContext& ctx UNUSED) { + const phi::DeviceContext& dev_ctx UNUSED) { auto* self_data = self.data(); auto* index_data = index.data(); auto* grad_data = grad.data(); @@ -643,16 +675,17 @@ void cpu_scatter_add_mean_value_grad_kernel( } template -void cpu_scatter_mul_min_max_value_grad_kernel(phi::DenseTensor self, - int dim, - const phi::DenseTensor& index, - const phi::DenseTensor& out, - const phi::DenseTensor& x, - const 
phi::DenseTensor& value, - phi::DenseTensor grad, - const std::string& reduce, - bool include_self, - const phi::DeviceContext& ctx) { +void cpu_scatter_mul_min_max_value_grad_kernel( + phi::DenseTensor self, + int dim, + const phi::DenseTensor& index, + const phi::DenseTensor& out, + const phi::DenseTensor& x, + const phi::DenseTensor& value, + phi::DenseTensor grad, + const std::string& reduce, + bool include_self, + const phi::DeviceContext& dev_ctx) { auto* self_data = self.data(); auto* index_data = index.data(); auto* grad_data = grad.data(); diff --git a/paddle/phi/kernels/funcs/math/cos_sim_functor.cc b/paddle/phi/kernels/funcs/math/cos_sim_functor.cc index 60f1b388de3ad0..cba26a884e72c2 100644 --- a/paddle/phi/kernels/funcs/math/cos_sim_functor.cc +++ b/paddle/phi/kernels/funcs/math/cos_sim_functor.cc @@ -18,7 +18,7 @@ namespace phi { namespace math { template struct CosSimDyFunctor { - void operator()(const phi::CPUContext& ctx, + void operator()(const phi::CPUContext& dev_ctx, const T* x_norm, const T* y_norm, const T* x, diff --git a/paddle/phi/kernels/funcs/math/cos_sim_functor.cu b/paddle/phi/kernels/funcs/math/cos_sim_functor.cu index 762178b4a9d613..f37fd91ee87efd 100644 --- a/paddle/phi/kernels/funcs/math/cos_sim_functor.cu +++ b/paddle/phi/kernels/funcs/math/cos_sim_functor.cu @@ -50,7 +50,7 @@ __global__ void CosSimDyKernel(const T* x_norm, template struct CosSimDyFunctor { - void operator()(const phi::GPUContext& ctx, + void operator()(const phi::GPUContext& dev_ctx, const T* x_norm, const T* y_norm, const T* x, @@ -63,7 +63,7 @@ struct CosSimDyFunctor { const int block_size = 512; dim3 threads(block_size, 1); dim3 grid((rows + block_size - 1) / block_size, 1); - CosSimDyKernel<<>>( + CosSimDyKernel<<>>( x_norm, y_norm, x, y, z, dz, rows, cols, dy); } }; diff --git a/paddle/phi/kernels/funcs/math/cos_sim_functor.h b/paddle/phi/kernels/funcs/math/cos_sim_functor.h index e01af90df4d4e1..ed2e71f8af8d8f 100644 --- a/paddle/phi/kernels/funcs/math/cos_sim_functor.h +++ b/paddle/phi/kernels/funcs/math/cos_sim_functor.h @@ -174,7 +174,7 @@ struct CosSimDxFunctor { template struct CosSimDyFunctor { - void operator()(const DeviceContext& ctx, + void operator()(const DeviceContext& dev_ctx, const T* x_norm, const T* y_norm, const T* x, diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h index 32afd7fdaa1b5d..a1da63a3ab9628 100644 --- a/paddle/phi/kernels/funcs/scatter.cu.h +++ b/paddle/phi/kernels/funcs/scatter.cu.h @@ -404,7 +404,7 @@ __global__ void scatter_gather_elementwise_kernel(int N, func_t f) { } template -void GPUScatterAdd(const phi::GPUContext& ctx, +void GPUScatterAdd(const phi::GPUContext& dev_ctx, const DenseTensor& src, const DenseTensor& index, DenseTensor* output, @@ -483,7 +483,7 @@ void GPUScatterAdd(const phi::GPUContext& ctx, constexpr int vt = 8; const dim3 block(nt); const dim3 grid((N + block.x * vt - 1) / (block.x * vt)); - auto stream = ctx.stream(); + auto stream = dev_ctx.stream(); scatter_gather_elementwise_kernel <<>>(N, reduce_add); diff --git a/paddle/phi/kernels/funcs/unique_functor.h b/paddle/phi/kernels/funcs/unique_functor.h index 758b9160096d09..fafb1b284c60a8 100644 --- a/paddle/phi/kernels/funcs/unique_functor.h +++ b/paddle/phi/kernels/funcs/unique_functor.h @@ -23,6 +23,35 @@ namespace phi { namespace funcs { +template +static bool NaNSafeEqual(const T& a, const T& b) { + if constexpr (std::is_floating_point_v) { + if (std::isnan(a) && std::isnan(b)) { + return &a == &b; + } + if (std::isnan(a) || 
std::isnan(b)) { + return false; + } + } + return a == b; +} + +template +static bool NaNSafeLess(const T& a, const T& b) { + if constexpr (std::is_floating_point_v) { + if (std::isnan(a) && !std::isnan(b)) { + return false; + } + if (!std::isnan(a) && std::isnan(b)) { + return true; + } + if (std::isnan(a) && std::isnan(b)) { + return &a < &b; + } + } + return a < b; +} + template struct UniqueOpFunctor { const Context& dev_ctx_; @@ -122,7 +151,7 @@ static bool Equal(const DenseTensor& a, const DenseTensor& b) { return false; } for (int64_t i = 0; i < a.numel(); ++i) { - if (a.data()[i] != b.data()[i]) { + if (!NaNSafeEqual(a.data()[i], b.data()[i])) { return false; } } @@ -140,7 +169,15 @@ static void UniqueFlattenedTensor(const Context& dev_ctx, bool return_inverse, bool return_counts) { const InT* in_data = in.data(); - std::set unique(in_data, in_data + in.numel()); + + auto nan_safe_comp = [](const InT& a, const InT& b) { + return NaNSafeLess(a, b); + }; + std::set unique(nan_safe_comp); + for (int64_t i = 0; i < in.numel(); ++i) { + unique.insert(in_data[i]); + } + out->Resize(common::make_ddim({static_cast(unique.size())})); auto* out_data = dev_ctx.template Alloc(out); std::copy(unique.begin(), unique.end(), out_data); @@ -162,29 +199,27 @@ static void UniqueFlattenedTensor(const Context& dev_ctx, if (return_inverse) { index->Resize(common::make_ddim({in.numel()})); auto inverse_data = dev_ctx.template Alloc(index); - std::unordered_map inverse_map; - inverse_map.reserve(out->numel()); - for (int64_t i = 0; i < out->numel(); ++i) { - inverse_map[out_data[i]] = i; - } for (int64_t i = 0; i < in.numel(); ++i) { - inverse_data[i] = inverse_map[in_data[i]]; + for (int64_t j = 0; j < out->numel(); ++j) { + if (NaNSafeEqual(in_data[i], out_data[j])) { + inverse_data[i] = j; + break; + } + } } } if (return_counts) { count->Resize(common::make_ddim({out->numel()})); auto count_data = dev_ctx.template Alloc(count); - std::unordered_map counts_map; - counts_map.reserve(out->numel()); for (int64_t i = 0; i < out->numel(); ++i) { - counts_map[out_data[i]] = 0; - } - for (int64_t i = 0; i < in.numel(); i++) { - counts_map[in_data[i]] += 1; - } - for (int64_t i = 0; i < out->numel(); i++) { - count_data[i] = counts_map[out_data[i]]; + IndexT cnt = 0; + for (int64_t j = 0; j < in.numel(); ++j) { + if (NaNSafeEqual(out_data[i], in_data[j])) { + cnt++; + } + } + count_data[i] = cnt; } } } diff --git a/paddle/phi/kernels/fusion/gpu/fused_transpose_split_quant_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_transpose_split_quant_kernel.cu index 23ddde393f3dd2..16503aa32f263d 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_transpose_split_quant_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_transpose_split_quant_kernel.cu @@ -29,43 +29,62 @@ struct __align__(sizeof(T) * VecSize) VecType { } }; -template -__device__ void BlockLoad(const phi::bfloat16* input, +template +__device__ void BlockLoad(const InT* input, + const float* input_scales, __nv_bfloat16 x[8][4], - size_t K) { + size_t K, + size_t k_scaled) { + constexpr bool need_dequant = std::is_same_v; + +#pragma unroll for (uint32_t i = 0; i < 8; i++) { - size_t off_m = blockIdx.x * size_t(128) + threadIdx.y + i * 16; - size_t off_k = blockIdx.y * 128 + threadIdx.x * VecSize; - size_t offset = off_m * K + off_k; + const uint32_t local_off_M = threadIdx.y + i * 16; + const uint32_t off_m = blockIdx.x * 128 + local_off_M; + const uint32_t off_k = blockIdx.y * 128 + threadIdx.x * VecSize; + const size_t offset = off_m * K + off_k; + + float 
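
The NaNSafeEqual/NaNSafeLess helpers above exist because IEEE-754 NaN compares false against everything, itself included: with the default `operator<`, `std::set` sees `!(a < b) && !(b < a)` for NaN against any number, treats them as equivalent, and silently drops elements (a strict-weak-ordering violation). The helpers order NaNs after all numbers and break NaN-vs-NaN ties by object address, so every distinct NaN survives deduplication. A compact demonstration of the failure mode and the comparator shape:

#include <cmath>
#include <cstdio>
#include <set>

int main() {
  const float nan_v = std::nanf("");
  // NaN is "equivalent" to 1.0f under the default comparator and is dropped:
  std::set<float> naive{1.0f, nan_v, 2.0f};
  std::printf("naive size: %zu\n", naive.size());  // typically 2, not 3

  // NaN-aware ordering in the spirit of NaNSafeLess:
  auto nan_less = [](const float& a, const float& b) {
    if (std::isnan(a) && !std::isnan(b)) return false;  // NaNs sort last
    if (!std::isnan(a) && std::isnan(b)) return true;
    if (std::isnan(a) && std::isnan(b)) return &a < &b;  // identity tie-break
    return a < b;
  };
  std::set<float, decltype(nan_less)> safe({1.0f, nan_v, 2.0f}, nan_less);
  std::printf("safe size: %zu\n", safe.size());  // 3
  return 0;
}

The address tie-break means equality by identity: two NaNs compare equal only if they are the same object, matching the "every NaN is unique" semantics the rewritten UniqueFlattenedTensor relies on, at the cost of the O(n * m) inverse/count scans that replace the old hash maps (NaN cannot key an unordered_map usefully either).
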
scale; + if constexpr (need_dequant) { + const uint32_t m_base = blockIdx.x * 128; + const uint32_t m_stride = k_scaled; + scale = input_scales[off_m * m_stride + blockIdx.y]; + } +#pragma unroll for (uint32_t j = 0; j < 4; j += VecSize) { - if (off_k + j * 32 < K) { - size_t idx = offset + j * 32; - using LoadT = VecType<__nv_bfloat16, VecSize>; - LoadT data = *reinterpret_cast(input + idx); - for (uint32_t k = 0; k < VecSize; k++) { - x[i][j + k] = data[k]; + const size_t idx = offset + j * 32; + using LoadT = VecType; + LoadT data = *reinterpret_cast(input + idx); +#pragma unroll + for (uint32_t k = 0; k < VecSize; k++) { + if constexpr (need_dequant) { + x[i][j + k] = __float2bfloat16(static_cast(data[k]) * scale); + } else { + x[i][j + k] = (*reinterpret_cast<__nv_bfloat16*>(&data[k])); } } } } } - template __device__ void BlockColumnScale(const __nv_bfloat16 x[8][4], - float col_scale[128], + float scales[128], __nv_bfloat16* shm) { // reduce [(8), 16, 32, 4] => [16, 32, 4] __nv_bfloat16 warp_max[4]; +#pragma unroll for (uint32_t i = 0; i < 8; i++) { +#pragma unroll for (uint32_t j = 0; j < 4; j++) { - __nv_bfloat16 t = BF16_ABS(x[i][j]); + const __nv_bfloat16 t = BF16_ABS(x[i][j]); warp_max[j] = i == 0 ? t : BF16_MAX(warp_max[j], t); } } // reduce [(16), 32, 4] => [8, 32, 4] if (threadIdx.y >= 8) { +#pragma unroll for (uint32_t j = 0; j < 4; j++) { shm[(threadIdx.y - 8) * 128 + threadIdx.x + j * 32] = warp_max[j]; } @@ -75,8 +94,9 @@ __device__ void BlockColumnScale(const __nv_bfloat16 x[8][4], // reduce [(8), 32, 4] => [32, 4] for (uint32_t offset = 8; offset > 0; offset /= 2) { if (threadIdx.y < offset) { +#pragma unroll for (uint32_t j = 0; j < 4; j++) { - __nv_bfloat16 other = + const __nv_bfloat16 other = offset == 8 ? warp_max[j] : shm[(threadIdx.y + offset) * 128 + threadIdx.x + j * 32]; @@ -85,7 +105,7 @@ __device__ void BlockColumnScale(const __nv_bfloat16 x[8][4], if (offset > 1) { shm[threadIdx.y * 128 + threadIdx.x + j * 32] = next_val; } else { - col_scale[threadIdx.x + j * 32] = + scales[threadIdx.x + j * 32] = ComputeScale<__nv_bfloat16, __nv_fp8_e4m3, Pow2Scales>( static_cast(next_val), 0.0f); } @@ -98,7 +118,7 @@ __device__ void BlockColumnScale(const __nv_bfloat16 x[8][4], template __device__ void BlockStoreScale(float* scale, size_t off_m, - float col_scale[128], + float scales[128], size_t K) { if (threadIdx.y < 4) { uint32_t off = threadIdx.y * 32 + threadIdx.x; @@ -107,10 +127,10 @@ __device__ void BlockStoreScale(float* scale, } else if constexpr (VecSize == 2) { off = (off / 64) * 64 + (off % 2) * 32 + (off % 64) / 2; } - float scale_out = 1.0f / col_scale[off]; - size_t idx_y = blockIdx.x - off_m / 128; - size_t idx_x = blockIdx.y * 128 + threadIdx.y * 32 + threadIdx.x; - size_t idx = idx_y * K + idx_x; + float scale_out = 1.0f / scales[off]; + const size_t idx_y = blockIdx.x - off_m / 128; + const size_t idx_x = blockIdx.y * 128 + threadIdx.y * 32 + threadIdx.x; + const size_t idx = idx_y * K + idx_x; if (idx_x < K) { scale[idx] = scale_out; } @@ -123,14 +143,16 @@ __device__ void BlockStoreOut(OutT* out, size_t cur_tokens, const OutT shm[128][129], size_t K) { +#pragma unroll for (uint32_t i = 0; i < 8; i++) { - size_t idx_m = blockIdx.x * size_t(128) + threadIdx.x * 4; - size_t idx_k = blockIdx.y * 128 + threadIdx.y + i * 16; - size_t idx = idx_k * cur_tokens + (idx_m - off_m); + const size_t idx_m = blockIdx.x * size_t(128) + threadIdx.x * 4; + const size_t idx_k = blockIdx.y * 128 + threadIdx.y + i * 16; + const size_t idx = idx_k * cur_tokens + (idx_m 
- off_m); if (idx_k < K) { using StoreT = VecType; StoreT data; +#pragma unroll for (uint32_t j = 0; j < VecSize; j++) { data[j] = shm[i * 16 + threadIdx.y][threadIdx.x * 4 + j]; } @@ -139,23 +161,27 @@ __device__ void BlockStoreOut(OutT* out, } } -template +template __global__ void __launch_bounds__(512) - FusedTransposeSplitQuantKernel(const phi::bfloat16* __restrict__ input, + FusedTransposeSplitQuantKernel(const InT* __restrict__ input, + const float* __restrict__ input_scales, int64_t* __restrict__ meta, size_t num_experts, - size_t K) { + size_t K, + size_t k_scaled) { __shared__ OutT shm[128][129]; + __shared__ size_t expert_info[2]; + __shared__ float scales[128]; // Could this buffer be reused? Is it worthwhile? + int64_t* tokens_per_expert = meta; OutT** out_ptrs = reinterpret_cast(meta + num_experts); float** scale_ptrs = reinterpret_cast(meta + num_experts * 2); // 1. Load 128x128 elements from input __nv_bfloat16 x[8][4]; - BlockLoad(input, x, K); + BlockLoad(input, input_scales, x, K, k_scaled); // 2. Get expert index and offset of the current block - __shared__ size_t expert_info[2]; if (threadIdx.x == 0 && threadIdx.y == 0) { size_t idx_m = blockIdx.x * size_t(128); size_t off_m = 0, next_off_m = 0; @@ -172,21 +198,23 @@ __global__ void __launch_bounds__(512) } // 3. Calculate scale along the column - __shared__ float col_scale[128]; BlockColumnScale( - x, col_scale, reinterpret_cast<__nv_bfloat16*>(shm)); + x, scales, reinterpret_cast<__nv_bfloat16*>(shm)); // 4. Store scale const size_t expert_idx = expert_info[0]; const size_t off_m = expert_info[1]; - BlockStoreScale(scale_ptrs[expert_idx], off_m, col_scale, K); + BlockStoreScale(scale_ptrs[expert_idx], off_m, scales, K); - // 5. Scale x and save into shared memory with transposed layout +// 5. Scale x and save into shared memory with transposed layout +#pragma unroll for (uint32_t i = 0; i < 8; i++) { +#pragma unroll for (uint32_t j = 0; j < 4; j += VecSize) { +#pragma unroll for (uint32_t k = 0; k < VecSize; k++) { float x_fp32 = static_cast(x[i][j + k]); - float x_scaled = x_fp32 * col_scale[threadIdx.x + (j + k) * 32]; + float x_scaled = x_fp32 * scales[threadIdx.x + (j + k) * 32]; shm[threadIdx.x * VecSize + j * 32 + k][i * 16 + threadIdx.y] = static_cast(x_scaled); } @@ -204,10 +232,11 @@ template void FusedTransposeSplitQuantKernel( const Context& dev_ctx, const DenseTensor& x, + const paddle::optional& input_scales, const std::vector& tokens_per_expert, bool pow_2_scales, std::vector outs, - std::vector scales) { + std::vector output_scales) { auto x_dims = x.dims(); const int64_t M = x_dims[0]; const int64_t K = x_dims[1]; @@ -221,8 +250,8 @@ void FusedTransposeSplitQuantKernel( if (outs[i] != nullptr) { dev_ctx.template Alloc(outs[i]); } - if (scales[i] != nullptr) { - dev_ctx.template Alloc(scales[i]); + if (output_scales[i] != nullptr) { + dev_ctx.template Alloc(output_scales[i]); } } @@ -245,8 +274,8 @@ void FusedTransposeSplitQuantKernel( for (size_t i = 0; i < num_experts; i++) { meta_ptr[num_experts * 2 + i] = - scales[i] != nullptr - ? reinterpret_cast(scales[i]->data()) + output_scales[i] != nullptr + ?
reinterpret_cast(output_scales[i]->data()) : 0; } @@ -255,23 +284,35 @@ void FusedTransposeSplitQuantKernel( auto stream = dev_ctx.stream(); - dim3 grid(M / 128, (K + 127) / 128); + // pre-compute on CPU to reduce size_t division cost in kernel + const size_t k_scaled = (K + 127) / 128; + dim3 grid(M / 128, k_scaled); dim3 block(32, 16); -#define LAUNCH_KERNEL(POW_2_SCALES, VEC_SIZE) \ - FusedTransposeSplitQuantKernel \ - <<>>(x.data(), \ - meta_gpu.data(), \ - num_experts, \ - K); +#define DTYPE_CASE(dtype, type) dtype == phi::DataType::type +#define LAUNCH_KERNEL(T, POW_2_SCALES, VEC_SIZE) \ + FusedTransposeSplitQuantKernel<<>>( \ + x.data(), \ + input_scales ? input_scales.get_ptr()->data() : nullptr, \ + meta_gpu.data(), \ + num_experts, \ + K, \ + k_scaled); +#define DISPATCH_DATATYPE(POW_2_SCALES, VEC_SIZE) \ + if (DTYPE_CASE(x.dtype(), BFLOAT16)) { \ + LAUNCH_KERNEL(phi::bfloat16, POW_2_SCALES, VEC_SIZE); \ + } else if (DTYPE_CASE(x.dtype(), FLOAT8_E4M3FN)) { \ + LAUNCH_KERNEL(phi::float8_e4m3fn, POW_2_SCALES, VEC_SIZE); \ + } #define LAUNCH_KERNEL_PARTIAL(VEC_SIZE) \ if (pow_2_scales) { \ - LAUNCH_KERNEL(true, VEC_SIZE); \ + DISPATCH_DATATYPE(true, VEC_SIZE); \ } else { \ - LAUNCH_KERNEL(false, VEC_SIZE); \ + DISPATCH_DATATYPE(false, VEC_SIZE); \ } if (K % 4 == 0) { @@ -296,7 +337,8 @@ PD_REGISTER_KERNEL(fused_transpose_split_quant, double, int, int64_t, - phi::dtype::bfloat16) { + phi::dtype::bfloat16, + phi::dtype::float8_e4m3fn) { kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT8_E4M3FN); kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); } diff --git a/paddle/phi/kernels/gpu/cuda_gemm_kernel.h b/paddle/phi/kernels/gpu/cuda_gemm_kernel.h index f13831bc25034b..0efe77d7817dc0 100644 --- a/paddle/phi/kernels/gpu/cuda_gemm_kernel.h +++ b/paddle/phi/kernels/gpu/cuda_gemm_kernel.h @@ -26,7 +26,7 @@ typedef struct { } GemmParams; template -void CudaGemm(const Context& ctx, +void CudaGemm(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& w, DenseTensor* output); diff --git a/paddle/phi/kernels/gpu/min_max_with_index_grad_kernel.cu b/paddle/phi/kernels/gpu/min_max_with_index_grad_kernel.cu new file mode 100644 index 00000000000000..f34d03bf07e506 --- /dev/null +++ b/paddle/phi/kernels/gpu/min_max_with_index_grad_kernel.cu @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/gather_scatter_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +using EnableIfInteger = + typename std::enable_if::value, int>::type; + +template +using EnableIfNonInteger = + typename std::enable_if::value, int>::type; + +// Here if keepdim=True, this will fallback to a simplified version of +// take_along_axis. 
However, if keepdim=False (by default), indices will +// not have equal rank with the input values (and values_grad), therefore +// an unsqueeze is needed, done by shallow-copying indices and calling Resize +#define DEFINE_WITH_INDEX_GRAD_KERNEL(OpType) \ + template = 0> \ + void OpType##WithIndexGradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& values, \ + const DenseTensor& indices, \ + const DenseTensor& values_grad, \ + const Scalar& dim, \ + bool keepdim, \ + DenseTensor* x_grad) { \ + x_grad->Resize(x.dims()); \ + dev_ctx.template Alloc(x_grad); \ + if (x_grad->numel() == 0) { \ + return; \ + } \ + int64_t dim_val = dim.to(); \ + if (dim_val < 0) { \ + dim_val += x.dims().size(); \ + } \ + DenseTensor shallow_copied_inds(indices); \ + if (!keepdim) { \ + auto indices_dim = x.dims(); \ + indices_dim[dim_val] = 1; \ + shallow_copied_inds.Resize(indices_dim); \ + } \ + phi::funcs::SetConstant functor; \ + functor(dev_ctx, x_grad, static_cast(0)); \ + phi::funcs::gpu_scatter_add_kernel( \ + *x_grad, dim_val, shallow_copied_inds, values_grad, true, dev_ctx); \ + } \ + template = 0> \ + void OpType##WithIndexGradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& values, \ + const DenseTensor& indices, \ + const DenseTensor& values_grad, \ + const Scalar& dim, \ + bool keepdim, \ + DenseTensor* x_grad) { \ + std::string dtype_name = phi::DataTypeToString(values.dtype()); \ + PADDLE_ENFORCE_EQ( \ + 0, \ + 1, \ + phi::errors::InvalidArgument( \ + "Integer type '%s' is not allowed to have stop_gradient=False.", \ + dtype_name.c_str())); \ + } + +DEFINE_WITH_INDEX_GRAD_KERNEL(Max) +DEFINE_WITH_INDEX_GRAD_KERNEL(Min) + +#undef DEFINE_WITH_INDEX_GRAD_KERNEL + +} // namespace phi + +PD_REGISTER_KERNEL(max_with_index_grad, + GPU, + ALL_LAYOUT, + phi::MaxWithIndexGradKernel, + float, + double, + uint8_t, + int, + int16_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(min_with_index_grad, + GPU, + ALL_LAYOUT, + phi::MinWithIndexGradKernel, + float, + double, + uint8_t, + int, + int16_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/min_max_with_index_kernel.cu b/paddle/phi/kernels/gpu/min_max_with_index_kernel.cu new file mode 100644 index 00000000000000..2509c34fb0c8fd --- /dev/null +++ b/paddle/phi/kernels/gpu/min_max_with_index_kernel.cu @@ -0,0 +1,312 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
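The backward macro above hinges on one shape detail: with keepdim=False, indices has one fewer dimension than x, so the kernel shallow-copies indices and Resizes the copy to reinsert a size-1 axis at dim before calling gpu_scatter_add_kernel. A minimal NumPy sketch of the same rule (hypothetical shapes, not the Paddle kernel itself):

import numpy as np

# Sketch of the max_with_index backward rule: route values_grad back to
# the argmax positions along `dim`.
def max_with_index_grad(x, indices, values_grad, dim, keepdim):
    x_grad = np.zeros_like(x)
    if not keepdim:
        # reinsert the reduced axis, mirroring the shallow-copy + Resize
        indices = np.expand_dims(indices, axis=dim)
        values_grad = np.expand_dims(values_grad, axis=dim)
    # each slice contributes exactly one argmax entry, so writing into a
    # zero tensor matches the kernel's scatter-add
    np.put_along_axis(x_grad, indices, values_grad, axis=dim)
    return x_grad

x = np.array([[1.0, 5.0, 3.0], [4.0, 2.0, 6.0]])
idx = x.argmax(axis=1)  # shape [2]; rank reduced because keepdim=False
g = max_with_index_grad(x, idx, np.ones(2), dim=1, keepdim=False)
# g == [[0, 1, 0], [0, 0, 1]]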
+ +#include "paddle/phi/kernels/min_max_with_index_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(__NVCC__) || defined(__HIPCC__) + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif +#include + +#include "paddle/common/ddim.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" +namespace phi { + +namespace { // NOLINT +template +using KeyValuePair = cub::KeyValuePair; + +} // namespace + +#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ + case (1 << (log2_block_dim)): { \ + constexpr auto kBlockDim = (1 << (log2_block_dim)); \ + __VA_ARGS__; \ + } break + +#define FIXED_BLOCK_DIM_CASE(...) \ + FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); + +template +__global__ void MinMaxWithIndexKernel(const int64_t height, // n * h + const int64_t width, // c + const int64_t post_size, // h + const Reducer reducer, + const T init, + const T* in, + T* val_out, + IndType* key_out) { + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (IndexType idx = blockIdx.x; idx < height; idx += gridDim.x) { + KeyValuePair kv_pair = {-1, init}; + IndexType h = idx / post_size; + IndexType w = idx % post_size; + for (IndexType k = threadIdx.x; k < width; k += blockDim.x) { + kv_pair = + reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair); + } + kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); + if (threadIdx.x == 0) { + val_out[idx] = static_cast(kv_pair.value); + key_out[idx] = static_cast(kv_pair.key); + } + __syncthreads(); + } +} + +template +void ComputeMinMaxWithIndex(const phi::GPUContext& dev_ctx, + const DenseTensor& input, + DenseTensor* values, + DenseTensor* indices, + const int64_t pre, + const int64_t post, + const int64_t n) { + auto cu_stream = dev_ctx.stream(); + auto ComputeBlockSize = [](int64_t col) { + auto block_size = 8; + if (col > 512) + block_size = 1024; + else if (col > 256) + block_size = 512; + else if (col > 128) + block_size = 256; + else if (col > 64) + block_size = 128; + else if (col > 32) + block_size = 64; + else if (col > 16) + block_size = 32; + else if (col > 8) + block_size = 16; + return block_size; + }; + + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; + int64_t height = pre * post; + int64_t width = n; + int64_t grid_size = height < max_grid_dimx ? 
height : max_grid_dimx; + + const T* in_data = input.data(); + + T* val_data = dev_ctx.template Alloc(values); + IndType* ind_data = dev_ctx.template Alloc(indices); + + if (typeid(Reducer) == typeid(cub::ArgMax)) { + switch (ComputeBlockSize(width)) { + FIXED_BLOCK_DIM_CASE( + MinMaxWithIndexKernel + <<>>( + height, + width, + post, + Reducer(), + std::numeric_limits::lowest(), + in_data, + val_data, + ind_data)); + } + } else { + switch (ComputeBlockSize(width)) { + FIXED_BLOCK_DIM_CASE( + MinMaxWithIndexKernel + <<>>( + height, + width, + post, + Reducer(), + std::numeric_limits::max(), + in_data, + val_data, + ind_data)); + } + } +} + +template +struct VisitDataCudaMinMaxWithIndexFunctor { + const Context& dev_ctx; + const DenseTensor& x; + int64_t axis; + bool keepdims; + bool flatten; + DenseTensor* val_out; + DenseTensor* ind_out; + + explicit VisitDataCudaMinMaxWithIndexFunctor(const Context& dev_ctx, + const DenseTensor& x, + int64_t axis, + bool keepdims, + bool flatten, + DenseTensor* val_out, + DenseTensor* ind_out) + : dev_ctx(dev_ctx), + x(x), + axis(axis), + keepdims(keepdims), + flatten(flatten), + val_out(val_out), + ind_out(ind_out) {} + + template + void apply() const { + phi::DDim x_dims; + int new_axis = axis; + if (flatten) { + x_dims = common::make_ddim({x.numel()}); + // if flatten, treat the axis as 0 + new_axis = 0; + } else { + x_dims = x.dims(); + if (axis < 0) new_axis = axis + x.dims().size(); + } + if (x.numel() == 0) { + dev_ctx.template Alloc(val_out); + dev_ctx.template Alloc(ind_out); + return; + } + // For 0D Tensor + if (x.dims().size() == 0) { + dev_ctx.template Alloc(val_out); + dev_ctx.template Alloc(ind_out); + phi::funcs::set_constant(dev_ctx, ind_out, static_cast(0)); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, val_out); + return; + } + + int64_t numel = x.numel(); + int64_t groups = numel / x_dims[new_axis]; + int64_t pre = 1; + int64_t post = 1; + int64_t n = x_dims[new_axis]; + + for (int i = 0; i < new_axis; i++) { + pre *= x_dims[i]; + } + + for (int i = new_axis + 1; i < x_dims.size(); i++) { + post *= x_dims[i]; + } + + if (numel > std::numeric_limits::max()) { + ComputeMinMaxWithIndex( + dev_ctx, x, val_out, ind_out, pre, post, n); + } else { + ComputeMinMaxWithIndex( + dev_ctx, x, val_out, ind_out, pre, post, n); + } + } +}; + +template +void MinMaxWithIndexOpCUDAKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& axis, + bool keepdims, + bool flatten, + DenseTensor* val_out, + DenseTensor* ind_out) { + PADDLE_ENFORCE_GE( + x.numel(), + 0, + common::errors::InvalidArgument( + "(min/max)_with_index input numel must be >= 0, but got %d", x.numel())); + phi::VisitDataTypeTiny( + phi::DataType::INT64, + VisitDataCudaMinMaxWithIndexFunctor( + dev_ctx, x, axis.to(), keepdims, flatten, val_out, ind_out)); +} + +template +void MinWithIndexKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& dim, + bool keepdim, + bool flatten, + DenseTensor* val_out, + DenseTensor* ind_out) { + MinMaxWithIndexOpCUDAKernel( + dev_ctx, x, dim, keepdim, flatten, val_out, ind_out); +} + +template +void MaxWithIndexKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& dim, + bool keepdim, + bool flatten, + DenseTensor* val_out, + DenseTensor* ind_out) { + MinMaxWithIndexOpCUDAKernel( + dev_ctx, x, dim, keepdim, flatten, val_out, ind_out); +} + +#endif + +} // namespace phi + +PD_REGISTER_KERNEL(min_with_index, + GPU, + ALL_LAYOUT, + phi::MinWithIndexKernel, + phi::dtype::float16, + phi::dtype::bfloat16, + float,
+ double, + int32_t, + int64_t, + int16_t, + uint8_t) { + kernel->OutputAt(0).SetDataType(kernel->InputAt(0).dtype); + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); +} + +PD_REGISTER_KERNEL(max_with_index, + GPU, + ALL_LAYOUT, + phi::MaxWithIndexKernel, + phi::dtype::float16, + phi::dtype::bfloat16, + float, + double, + int32_t, + int64_t, + int16_t, + uint8_t) { + kernel->OutputAt(0).SetDataType(kernel->InputAt(0).dtype); + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu index 95132d09e2cc22..3f55297474015c 100644 --- a/paddle/phi/kernels/gpu/reduce_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_kernel.cu @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/reduce_kernel.h" +#include #include "paddle/phi/kernels/gpu/reduce_amin_amax_common.h" #include "paddle/phi/kernels/reduce_amin_grad_kernel.h" diff --git a/paddle/phi/kernels/index_elementwise_get_grad_kernel.h b/paddle/phi/kernels/index_elementwise_get_grad_kernel.h index 42550bbc08de70..f5d9c3a2847d05 100644 --- a/paddle/phi/kernels/index_elementwise_get_grad_kernel.h +++ b/paddle/phi/kernels/index_elementwise_get_grad_kernel.h @@ -20,7 +20,7 @@ namespace phi { template -void IndexElementwiseGetGradKernel(const Context& ctx, +void IndexElementwiseGetGradKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& index, const DenseTensor& out_grad, diff --git a/paddle/phi/kernels/legacy/compare_kernel.h b/paddle/phi/kernels/legacy/compare_kernel.h index 541ec10d244da4..95ea7081a1cfa3 100644 --- a/paddle/phi/kernels/legacy/compare_kernel.h +++ b/paddle/phi/kernels/legacy/compare_kernel.h @@ -19,42 +19,42 @@ limitations under the License. */ namespace phi { template -void LessThanRawKernel(const Context& ctx, +void LessThanRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out); template -void LessEqualRawKernel(const Context& ctx, +void LessEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out); template -void GreaterThanRawKernel(const Context& ctx, +void GreaterThanRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out); template -void GreaterEqualRawKernel(const Context& ctx, +void GreaterEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out); template -void EqualRawKernel(const Context& ctx, +void EqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out); template -void NotEqualRawKernel(const Context& ctx, +void NotEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, diff --git a/paddle/phi/kernels/legacy/cpu/compare_kernel.cc b/paddle/phi/kernels/legacy/cpu/compare_kernel.cc index 5b11c81f573a80..77800701c94b26 100644 --- a/paddle/phi/kernels/legacy/cpu/compare_kernel.cc +++ b/paddle/phi/kernels/legacy/cpu/compare_kernel.cc @@ -25,23 +25,23 @@ template -inline void CompareRawKernelImpl(const Context& ctx, +inline void CompareRawKernelImpl(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { - ctx.template Alloc(out); + dev_ctx.template Alloc(out); if (x.dims().size() >= y.dims().size()) { funcs::ElementwiseCompute( - ctx, x, y, Functor(), out, axis); + dev_ctx, x, y, Functor(), out, axis); } else { funcs::ElementwiseCompute( - ctx, x, 
y, InverseFunctor(), out, axis); + dev_ctx, x, y, InverseFunctor(), out, axis); } } template -void LessThanRawKernel(const Context& ctx, +void LessThanRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, @@ -49,11 +49,11 @@ void LessThanRawKernel(const Context& ctx, CompareRawKernelImpl, - funcs::GreaterThanFunctor>(ctx, x, y, axis, out); + funcs::GreaterThanFunctor>(dev_ctx, x, y, axis, out); } template -void LessEqualRawKernel(const Context& ctx, +void LessEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, @@ -61,11 +61,11 @@ void LessEqualRawKernel(const Context& ctx, CompareRawKernelImpl, - funcs::GreaterEqualFunctor>(ctx, x, y, axis, out); + funcs::GreaterEqualFunctor>(dev_ctx, x, y, axis, out); } template -void GreaterThanRawKernel(const Context& ctx, +void GreaterThanRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, @@ -73,10 +73,10 @@ void GreaterThanRawKernel(const Context& ctx, CompareRawKernelImpl, - funcs::LessThanFunctor>(ctx, x, y, axis, out); + funcs::LessThanFunctor>(dev_ctx, x, y, axis, out); } template -void GreaterEqualRawKernel(const Context& ctx, +void GreaterEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, @@ -84,10 +84,10 @@ void GreaterEqualRawKernel(const Context& ctx, CompareRawKernelImpl, - funcs::LessEqualFunctor>(ctx, x, y, axis, out); + funcs::LessEqualFunctor>(dev_ctx, x, y, axis, out); } template -void EqualRawKernel(const Context& ctx, +void EqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, @@ -95,10 +95,10 @@ void EqualRawKernel(const Context& ctx, CompareRawKernelImpl, - funcs::EqualFunctor>(ctx, x, y, axis, out); + funcs::EqualFunctor>(dev_ctx, x, y, axis, out); } template -void NotEqualRawKernel(const Context& ctx, +void NotEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, @@ -106,7 +106,7 @@ void NotEqualRawKernel(const Context& ctx, CompareRawKernelImpl, - funcs::NotEqualFunctor>(ctx, x, y, axis, out); + funcs::NotEqualFunctor>(dev_ctx, x, y, axis, out); } } // namespace phi diff --git a/paddle/phi/kernels/legacy/cpu/legacy_generate_proposals_kernel.cc b/paddle/phi/kernels/legacy/cpu/legacy_generate_proposals_kernel.cc index 5e6249249b1ee9..a77372f4592020 100644 --- a/paddle/phi/kernels/legacy/cpu/legacy_generate_proposals_kernel.cc +++ b/paddle/phi/kernels/legacy/cpu/legacy_generate_proposals_kernel.cc @@ -28,7 +28,7 @@ namespace phi { template std::pair ProposalForOneImage( - const phi::CPUContext &ctx, + const phi::CPUContext &dev_ctx, const phi::DenseTensor &im_info_slice, const phi::DenseTensor &anchors, const phi::DenseTensor &variances, @@ -44,7 +44,7 @@ std::pair ProposalForOneImage( // Sort index phi::DenseTensor index_t; index_t.Resize({scores_slice.numel()}); - int *index = ctx.Alloc(&index_t); + int *index = dev_ctx.Alloc(&index_t); for (int i = 0; i < scores_slice.numel(); ++i) { index[i] = i; } @@ -65,53 +65,54 @@ std::pair ProposalForOneImage( bbox_sel.Resize({index_t.numel(), 4}); anchor_sel.Resize({index_t.numel(), 4}); var_sel.Resize({index_t.numel(), 4}); - ctx.Alloc(&scores_sel); - ctx.Alloc(&bbox_sel); - ctx.Alloc(&anchor_sel); - ctx.Alloc(&var_sel); + dev_ctx.Alloc(&scores_sel); + dev_ctx.Alloc(&bbox_sel); + dev_ctx.Alloc(&anchor_sel); + dev_ctx.Alloc(&var_sel); - phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); - phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, 
&bbox_sel); - phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); - phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(dev_ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(dev_ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(dev_ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(dev_ctx, variances, index_t, &var_sel); phi::DenseTensor proposals; proposals.Resize({index_t.numel(), 4}); - ctx.Alloc(&proposals); - phi::funcs::BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); + dev_ctx.Alloc(&proposals); + phi::funcs::BoxCoder( + dev_ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); phi::funcs::ClipTiledBoxes( - ctx, im_info_slice, proposals, &proposals, false); + dev_ctx, im_info_slice, proposals, &proposals, false); phi::DenseTensor keep; phi::funcs::FilterBoxes( - ctx, &proposals, min_size, im_info_slice, true, &keep); + dev_ctx, &proposals, min_size, im_info_slice, true, &keep); // Handle the case when there is no keep index left if (keep.numel() == 0) { phi::funcs::SetConstant set_zero; bbox_sel.Resize({1, 4}); - ctx.Alloc(&bbox_sel); - set_zero(ctx, &bbox_sel, static_cast(0)); + dev_ctx.Alloc(&bbox_sel); + set_zero(dev_ctx, &bbox_sel, static_cast(0)); phi::DenseTensor scores_filter; scores_filter.Resize({1, 1}); - ctx.Alloc(&scores_filter); - set_zero(ctx, &scores_filter, static_cast(0)); + dev_ctx.Alloc(&scores_filter); + set_zero(dev_ctx, &scores_filter, static_cast(0)); return std::make_pair(bbox_sel, scores_filter); } phi::DenseTensor scores_filter; bbox_sel.Resize({keep.numel(), 4}); scores_filter.Resize({keep.numel(), 1}); - ctx.Alloc(&bbox_sel); - ctx.Alloc(&scores_filter); - phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); - phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); + dev_ctx.Alloc(&bbox_sel); + dev_ctx.Alloc(&scores_filter); + phi::funcs::CPUGather(dev_ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(dev_ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } phi::DenseTensor keep_nms = - phi::funcs::NMS(ctx, &bbox_sel, &scores_filter, nms_thresh, eta); + phi::funcs::NMS(dev_ctx, &bbox_sel, &scores_filter, nms_thresh, eta); if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { keep_nms.Resize({post_nms_top_n}); @@ -119,10 +120,10 @@ std::pair ProposalForOneImage( proposals.Resize({keep_nms.numel(), 4}); scores_sel.Resize({keep_nms.numel(), 1}); - ctx.Alloc(&proposals); - ctx.Alloc(&scores_sel); - phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); - phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + dev_ctx.Alloc(&proposals); + dev_ctx.Alloc(&scores_sel); + phi::funcs::CPUGather(dev_ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(dev_ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/phi/kernels/legacy/cpu/one_hot_kernel.cc b/paddle/phi/kernels/legacy/cpu/one_hot_kernel.cc index 85347e71c606ff..d475c5fec98d94 100644 --- a/paddle/phi/kernels/legacy/cpu/one_hot_kernel.cc +++ b/paddle/phi/kernels/legacy/cpu/one_hot_kernel.cc @@ -25,20 +25,20 @@ struct OneHotV2OpFunctor { const DenseTensor* in_; DenseTensor* out_; int depth_; - const DeviceContext& ctx_; + const DeviceContext& dev_ctx_; OneHotV2OpFunctor(const DenseTensor* in, DenseTensor* out, int depth, - const DeviceContext& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + const DeviceContext& dev_ctx) + : in_(in), out_(out), 
depth_(depth), dev_ctx_(dev_ctx) {} template void apply() const { auto* p_in_data = in_->data(); auto numel = in_->numel(); - auto* p_out_data = ctx_.template Alloc(out_); - funcs::set_constant(ctx_, out_, 0.0); + auto* p_out_data = dev_ctx_.template Alloc(out_); + funcs::set_constant(dev_ctx_, out_, 0.0); for (int i = 0; i < numel; ++i) { PADDLE_ENFORCE_GE( diff --git a/paddle/phi/kernels/legacy/gpu/layer_norm_cuda_kernel.h b/paddle/phi/kernels/legacy/gpu/layer_norm_cuda_kernel.h index 39bd2837b9e451..f6d81228b34b68 100644 --- a/paddle/phi/kernels/legacy/gpu/layer_norm_cuda_kernel.h +++ b/paddle/phi/kernels/legacy/gpu/layer_norm_cuda_kernel.h @@ -943,7 +943,7 @@ void HostApplyRMSNorm(V* output, } template -void cuda_rms_norm(const Context& ctx, +void cuda_rms_norm(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& scale, int rows, @@ -960,7 +960,7 @@ void cuda_rms_norm(const Context& ctx, cols, \ epsilon, \ const_cast(scale.data()), \ - ctx.stream()) + dev_ctx.stream()) // scale.dtype() same as y->dtype() if (scale.dtype() == phi::DataType::FLOAT32) { DISPATCH_FWD_CASE(float); @@ -971,7 +971,7 @@ void cuda_rms_norm(const Context& ctx, } template -void HostRMSNormGradient(const Context& ctx, +void HostRMSNormGradient(const Context& dev_ctx, const V* dout, const U* invvar, const DenseTensor& input, @@ -992,7 +992,7 @@ void HostRMSNormGradient(const Context& ctx, const int nshared2 = nshared2_a > nshared2_b ? nshared2_a : nshared2_b; auto place = input.place(); DenseTensor part_grad_gamma = - phi::Empty(ctx, {part_size, n2}); + phi::Empty(dev_ctx, {part_size, n2}); cuComputePartGradGammaBeta<<>>( dout, input.data(), @@ -1038,7 +1038,7 @@ void HostRMSNormGradient(const Context& ctx, } template -void cuda_rms_norm_gradient(const Context& ctx, +void cuda_rms_norm_gradient(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& scale, const DenseTensor& invvar, @@ -1050,7 +1050,7 @@ void cuda_rms_norm_gradient(const Context& ctx, DenseTensor* grad_scale) { #define DISPATCH_BWD_CASE(scalar_t_out) \ HostRMSNormGradient( \ - ctx, \ + dev_ctx, \ dy.data(), \ invvar.data(), \ x, \ @@ -1060,7 +1060,7 @@ void cuda_rms_norm_gradient(const Context& ctx, epsilon, \ grad_x->data(), \ grad_scale->data(), \ - ctx.stream()) + dev_ctx.stream()) if (scale.dtype() == phi::DataType::FLOAT32) { DISPATCH_BWD_CASE(float); } else if (scale.dtype() == phi::DataType::BFLOAT16) { diff --git a/paddle/phi/kernels/legacy/gpu/legacy_generate_proposals_kernel.cu b/paddle/phi/kernels/legacy/gpu/legacy_generate_proposals_kernel.cu index c7630a3717a41f..90e1a9f1c498aa 100644 --- a/paddle/phi/kernels/legacy/gpu/legacy_generate_proposals_kernel.cu +++ b/paddle/phi/kernels/legacy/gpu/legacy_generate_proposals_kernel.cu @@ -30,7 +30,7 @@ namespace phi { namespace { template static std::pair ProposalForOneImage( - const phi::GPUContext &ctx, + const phi::GPUContext &dev_ctx, const phi::DenseTensor &im_info, const phi::DenseTensor &anchors, const phi::DenseTensor &variances, @@ -43,7 +43,7 @@ static std::pair ProposalForOneImage( float eta) { // 1. pre nms phi::DenseTensor scores_sort, index_sort; - phi::funcs::SortDescending(ctx, scores, &scores_sort, &index_sort); + phi::funcs::SortDescending(dev_ctx, scores, &scores_sort, &index_sort); int num = scores.numel(); int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() : pre_nms_top_n; @@ -53,10 +53,10 @@ static std::pair ProposalForOneImage( // 2. 
box decode and clipping phi::DenseTensor proposals; proposals.Resize({pre_nms_num, 4}); - ctx.Alloc(&proposals); + dev_ctx.Alloc(&proposals); { - phi::funcs::ForRange for_range(ctx, pre_nms_num); + phi::funcs::ForRange for_range(dev_ctx, pre_nms_num); for_range(phi::funcs::BoxDecodeAndClipFunctor{anchors.data(), bbox_deltas.data(), variances.data(), @@ -69,10 +69,10 @@ static std::pair ProposalForOneImage( phi::DenseTensor keep_index, keep_num_t; keep_index.Resize({pre_nms_num}); keep_num_t.Resize({1}); - ctx.Alloc(&keep_index); - ctx.Alloc(&keep_num_t); + dev_ctx.Alloc(&keep_index); + dev_ctx.Alloc(&keep_num_t); min_size = std::max(min_size, 1.0f); - auto stream = ctx.stream(); + auto stream = dev_ctx.stream(); phi::funcs::FilterBBoxes <<<1, 512, 0, stream>>>(proposals.data(), im_info.data(), @@ -81,14 +81,14 @@ static std::pair ProposalForOneImage( keep_num_t.data(), keep_index.data()); int keep_num; - const auto gpu_place = ctx.GetPlace(); + const auto gpu_place = dev_ctx.GetPlace(); phi::memory_utils::Copy(phi::CPUPlace(), &keep_num, gpu_place, keep_num_t.data(), sizeof(int), - ctx.stream()); - ctx.Wait(); + dev_ctx.stream()); + dev_ctx.Wait(); keep_index.Resize({keep_num}); phi::DenseTensor scores_filter, proposals_filter; @@ -97,18 +97,18 @@ static std::pair ProposalForOneImage( phi::funcs::SetConstant set_zero; proposals_filter.Resize({1, 4}); scores_filter.Resize({1, 1}); - ctx.Alloc(&proposals_filter); - ctx.Alloc(&scores_filter); - set_zero(ctx, &proposals_filter, static_cast(0)); - set_zero(ctx, &scores_filter, static_cast(0)); + dev_ctx.Alloc(&proposals_filter); + dev_ctx.Alloc(&scores_filter); + set_zero(dev_ctx, &proposals_filter, static_cast(0)); + set_zero(dev_ctx, &scores_filter, static_cast(0)); return std::make_pair(proposals_filter, scores_filter); } proposals_filter.Resize({keep_num, 4}); scores_filter.Resize({keep_num, 1}); - ctx.Alloc(&proposals_filter); - ctx.Alloc(&scores_filter); - phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); - phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); + dev_ctx.Alloc(&proposals_filter); + dev_ctx.Alloc(&scores_filter); + phi::funcs::GPUGather(dev_ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(dev_ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -116,7 +116,8 @@ static std::pair ProposalForOneImage( // 4. 
nms phi::DenseTensor keep_nms; - phi::funcs::NMS(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms); + phi::funcs::NMS( + dev_ctx, proposals_filter, keep_index, nms_thresh, &keep_nms); if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { keep_nms.Resize({post_nms_top_n}); } @@ -124,10 +125,10 @@ static std::pair ProposalForOneImage( phi::DenseTensor scores_nms, proposals_nms; proposals_nms.Resize({keep_nms.numel(), 4}); scores_nms.Resize({keep_nms.numel(), 1}); - ctx.Alloc(&proposals_nms); - ctx.Alloc(&scores_nms); - phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + dev_ctx.Alloc(&proposals_nms); + dev_ctx.Alloc(&scores_nms); + phi::funcs::GPUGather(dev_ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(dev_ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/phi/kernels/legacy/gpu/one_hot_kernel.cu b/paddle/phi/kernels/legacy/gpu/one_hot_kernel.cu index c64f2e2d755662..8030231e7fa025 100644 --- a/paddle/phi/kernels/legacy/gpu/one_hot_kernel.cu +++ b/paddle/phi/kernels/legacy/gpu/one_hot_kernel.cu @@ -44,24 +44,24 @@ template struct OneHotV2OpCUDAFunctor { const DenseTensor* in_; DenseTensor* out_; - const DeviceContext& ctx_; + const DeviceContext& dev_ctx_; int depth_; OneHotV2OpCUDAFunctor(const DenseTensor* in, DenseTensor* out, int depth, - const DeviceContext& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + const DeviceContext& dev_ctx) + : in_(in), out_(out), depth_(depth), dev_ctx_(dev_ctx) {} template void apply() const { auto* p_in_data = in_->data(); auto numel = in_->numel(); - auto* p_out_data = ctx_.template Alloc(out_); - auto stream = ctx_.stream(); - funcs::set_constant(ctx_, out_, 0.0); + auto* p_out_data = dev_ctx_.template Alloc(out_); + auto stream = dev_ctx_.stream(); + funcs::set_constant(dev_ctx_, out_, 0.0); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx_, numel); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx_, numel); FillOutputKernel<< -inline void CompareRawKernelImpl(const Context& ctx, +inline void CompareRawKernelImpl(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { - ctx.template Alloc(out); + dev_ctx.template Alloc(out); out->set_type(phi::DataType::BOOL); if (out->numel() == 0) return; std::vector ins{&x, &y}; std::vector outs{out}; - funcs::BroadcastKernel(ctx, ins, &outs, Functor(), axis); + funcs::BroadcastKernel(dev_ctx, ins, &outs, Functor(), axis); } template -void LessThanRawKernel(const Context& ctx, +void LessThanRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { CompareRawKernelImpl>( - ctx, x, y, axis, out); + dev_ctx, x, y, axis, out); } template -void LessEqualRawKernel(const Context& ctx, +void LessEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { CompareRawKernelImpl>( - ctx, x, y, axis, out); + dev_ctx, x, y, axis, out); } template -void GreaterThanRawKernel(const Context& ctx, +void GreaterThanRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { CompareRawKernelImpl>( - ctx, x, y, axis, out); + dev_ctx, x, y, axis, out); } template -void GreaterEqualRawKernel(const Context& ctx, +void GreaterEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, 
DenseTensor* out) { CompareRawKernelImpl>( - ctx, x, y, axis, out); + dev_ctx, x, y, axis, out); } template -void EqualRawKernel(const Context& ctx, +void EqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { CompareRawKernelImpl>( - ctx, x, y, axis, out); + dev_ctx, x, y, axis, out); } template -void NotEqualRawKernel(const Context& ctx, +void NotEqualRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis, DenseTensor* out) { CompareRawKernelImpl>( - ctx, x, y, axis, out); + dev_ctx, x, y, axis, out); } } // namespace phi diff --git a/paddle/phi/kernels/legacy/xpu/compare_kernel.cc b/paddle/phi/kernels/legacy/xpu/compare_kernel.cc index 3d461517ac6f22..4253b86915d45e 100644 --- a/paddle/phi/kernels/legacy/xpu/compare_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/compare_kernel.cc @@ -61,13 +61,13 @@ void XPUCompareRawKernelImpl( int axis, \ DenseTensor* out) { \ using XPUType = typename XPUTypeTrait::Type; \ - auto f = [](xpu::Context* ctx, \ + auto f = [](xpu::Context* xpu_ctx, \ const XPUType* x, \ const XPUType* y, \ bool* z, \ const std::vector& xshape, \ const std::vector& yshape) { \ - return functor(ctx, x, y, z, xshape, yshape); \ + return functor(xpu_ctx, x, y, z, xshape, yshape); \ }; \ XPUCompareRawKernelImpl(dev_ctx, x, y, out, f); \ } diff --git a/paddle/phi/kernels/legacy/xpu/elementwise_add_kernel.cc b/paddle/phi/kernels/legacy/xpu/elementwise_add_kernel.cc index 2ca79cd26160b3..b3a891f280f662 100644 --- a/paddle/phi/kernels/legacy/xpu/elementwise_add_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/elementwise_add_kernel.cc @@ -36,13 +36,13 @@ void AddRawKernel(const Context& dev_ctx, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_add(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_add(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); diff --git a/paddle/phi/kernels/legacy/xpu/elementwise_divide_kernel.cc b/paddle/phi/kernels/legacy/xpu/elementwise_divide_kernel.cc index 3fed6a52fdff48..d87bf7362581b8 100644 --- a/paddle/phi/kernels/legacy/xpu/elementwise_divide_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/elementwise_divide_kernel.cc @@ -31,13 +31,13 @@ void DivideRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_div(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_div(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); diff --git a/paddle/phi/kernels/legacy/xpu/elementwise_kernel.cc b/paddle/phi/kernels/legacy/xpu/elementwise_kernel.cc index 0825014319dfe9..ce9aa48b883b26 100644 --- a/paddle/phi/kernels/legacy/xpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/elementwise_kernel.cc @@ -31,13 +31,13 @@ void MaximumRawKernel(const Context& dev_ctx, } using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_max(ctx, x, y, z, xshape, yshape); + return 
xpu::broadcast_max(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); @@ -55,13 +55,13 @@ void MinimumRawKernel(const Context& dev_ctx, } using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_min(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_min(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); @@ -74,13 +74,13 @@ void RemainderRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_mod(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_mod(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); @@ -93,13 +93,13 @@ void FloorDivideRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_floordiv(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_floordiv(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); @@ -112,13 +112,13 @@ void ElementwisePowRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_pow(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_pow(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); diff --git a/paddle/phi/kernels/legacy/xpu/elementwise_multiply_kernel.cc b/paddle/phi/kernels/legacy/xpu/elementwise_multiply_kernel.cc index d64499498ae8b7..e3cf1e7f377f20 100644 --- a/paddle/phi/kernels/legacy/xpu/elementwise_multiply_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/elementwise_multiply_kernel.cc @@ -31,13 +31,13 @@ void MultiplyRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_mul(ctx, x, y, z, xshape, yshape); + return xpu::broadcast_mul(xpu_ctx, x, y, z, xshape, yshape); }; XPUElementwise(dev_ctx, x, y, axis, out, f); diff --git a/paddle/phi/kernels/legacy/xpu/elementwise_subtract_kernel.cc b/paddle/phi/kernels/legacy/xpu/elementwise_subtract_kernel.cc index bf5ea1381965ff..231b84a8dd91a4 100644 --- a/paddle/phi/kernels/legacy/xpu/elementwise_subtract_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/elementwise_subtract_kernel.cc @@ -26,13 +26,13 @@ void SubtractRawKernel(const Context& dev_ctx, int axis, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const XPUType* x, const XPUType* y, XPUType* z, const std::vector& xshape, const std::vector& yshape) { - return xpu::broadcast_sub(ctx, x, y, z, xshape, yshape); + return 
xpu::broadcast_sub(xpu_ctx, x, y, z, xshape, yshape); }; phi::XPUElementwise(dev_ctx, x, y, axis, out, f); diff --git a/paddle/phi/kernels/legacy/xpu/one_hot_kernel.cc b/paddle/phi/kernels/legacy/xpu/one_hot_kernel.cc index 02edbd128430b5..76903f89660e77 100644 --- a/paddle/phi/kernels/legacy/xpu/one_hot_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/one_hot_kernel.cc @@ -25,21 +25,21 @@ struct OneHotV2OpFunctor { const DenseTensor* in_; DenseTensor* out_; int depth_; - const Context& ctx_; + const Context& dev_ctx_; OneHotV2OpFunctor(const DenseTensor* in, DenseTensor* out, int depth, - const Context& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + const Context& dev_ctx) + : in_(in), out_(out), depth_(depth), dev_ctx_(dev_ctx) {} template void apply() const { auto* p_in_data = in_->data(); auto numel = in_->numel(); - auto* p_out_data = ctx_.template Alloc(out_); + auto* p_out_data = dev_ctx_.template Alloc(out_); int r = xpu::one_hot( - ctx_.x_context(), p_in_data, p_out_data, numel, depth_, 1.0, 0.0); + dev_ctx_.x_context(), p_in_data, p_out_data, numel, depth_, 1.0, 0.0); PADDLE_ENFORCE_XDNN_SUCCESS(r, "one_hot"); } }; diff --git a/paddle/phi/kernels/legacy/xpu/reduce_max_kernel.cc b/paddle/phi/kernels/legacy/xpu/reduce_max_kernel.cc index 4cb8d9d0439249..8c5881603e2e61 100644 --- a/paddle/phi/kernels/legacy/xpu/reduce_max_kernel.cc +++ b/paddle/phi/kernels/legacy/xpu/reduce_max_kernel.cc @@ -30,12 +30,12 @@ void MaxRawKernel(const Context& dev_ctx, DenseTensor* out) { reduce_all = recompute_reduce_all(x, dims, reduce_all); using XPUType = typename XPUTypeTrait::Type; - auto f = [](xpu::Context* ctx, + auto f = [](xpu::Context* xpu_ctx, const T* x, T* y, const std::vector& xdims, const std::vector& reduce_dims) { - return xpu::reduce_max(ctx, + return xpu::reduce_max(xpu_ctx, reinterpret_cast(x), reinterpret_cast(y), xdims, diff --git a/paddle/phi/kernels/min_max_with_index_kernel.h b/paddle/phi/kernels/min_max_with_index_kernel.h new file mode 100644 index 00000000000000..eca50fc3a752e8 --- /dev/null +++ b/paddle/phi/kernels/min_max_with_index_kernel.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MinWithIndexKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& dim, + bool keepdim, + bool flatten, + DenseTensor* val_out, + DenseTensor* ind_out); + +template +void MaxWithIndexKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& dim, + bool keepdim, + bool flatten, + DenseTensor* val_out, + DenseTensor* ind_out); + +} // namespace phi diff --git a/paddle/phi/kernels/onednn/gaussian_kernel.cc b/paddle/phi/kernels/onednn/gaussian_kernel.cc index 98197961a9df6b..61cdb580008611 100644 --- a/paddle/phi/kernels/onednn/gaussian_kernel.cc +++ b/paddle/phi/kernels/onednn/gaussian_kernel.cc @@ -20,7 +20,7 @@ namespace phi { template -void GaussianKernel(const Context& ctx, +void GaussianKernel(const Context& dev_ctx, const IntArray& shape, float mean, float std, @@ -33,10 +33,10 @@ void GaussianKernel(const Context& ctx, engine = std::make_shared(); engine->seed(seed); } else { - engine = ctx.GetGenerator()->GetCPUEngine(); + engine = dev_ctx.GetGenerator()->GetCPUEngine(); } - T* data = ctx.template Alloc(out); + T* data = dev_ctx.template Alloc(out); for (int64_t i = 0; i < out->numel(); ++i) { data[i] = dist(*engine); } diff --git a/paddle/phi/kernels/stride/index_select_kernel.cc b/paddle/phi/kernels/stride/index_select_kernel.cc index 0f3a8aae1e4e71..6db84f5c89180f 100644 --- a/paddle/phi/kernels/stride/index_select_kernel.cc +++ b/paddle/phi/kernels/stride/index_select_kernel.cc @@ -25,7 +25,7 @@ COMMON_DECLARE_bool(use_stride_kernel); namespace phi { template -void IndexSelectStridedKernel(const Context& ctx, +void IndexSelectStridedKernel(const Context& dev_ctx, const DenseTensor& x, int64_t index, int dim, diff --git a/paddle/phi/kernels/stride/slice_kernel.cc b/paddle/phi/kernels/stride/slice_kernel.cc index fe65a002b67df6..bff461867c37d9 100644 --- a/paddle/phi/kernels/stride/slice_kernel.cc +++ b/paddle/phi/kernels/stride/slice_kernel.cc @@ -26,7 +26,7 @@ COMMON_DECLARE_bool(use_stride_kernel); namespace phi { template -void SliceStridedKernel(const Context& ctx, +void SliceStridedKernel(const Context& dev_ctx, const DenseTensor& input, const std::vector& axes, const IntArray& starts_arr, diff --git a/paddle/phi/kernels/stride/transpose_kernel.cc b/paddle/phi/kernels/stride/transpose_kernel.cc index aaa4773f60808f..a5d627c4613267 100644 --- a/paddle/phi/kernels/stride/transpose_kernel.cc +++ b/paddle/phi/kernels/stride/transpose_kernel.cc @@ -22,7 +22,7 @@ COMMON_DECLARE_bool(use_stride_kernel); namespace phi { template -void TransposeStridedKernel(const Context& ctx, +void TransposeStridedKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& axis, DenseTensor* out) { diff --git a/paddle/phi/ops/yaml/backward.yaml b/paddle/phi/ops/yaml/backward.yaml index 4760d51061c0f1..6a931d443605be 100644 --- a/paddle/phi/ops/yaml/backward.yaml +++ b/paddle/phi/ops/yaml/backward.yaml @@ -2277,6 +2277,16 @@ kernel : func : max_pool3d_with_index_grad +- backward_op : max_with_index_grad + forward : max_with_index (Tensor x, Scalar dim, bool keepdim, bool flatten) -> Tensor(values), Tensor(indices) + args : (Tensor x, Tensor values, Tensor indices, Tensor values_grad, Scalar dim, bool keepdim) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : max_with_index_grad + - backward_op : maxout_grad forward : maxout(Tensor x, int groups, int axis) -> Tensor(out) 
args : (Tensor x, Tensor out, Tensor out_grad, int groups, int axis) @@ -2340,6 +2350,16 @@ func : meshgrid_grad data_type : out_grad +- backward_op : min_with_index_grad + forward : min_with_index (Tensor x, Scalar dim, bool keepdim, bool flatten) -> Tensor(values), Tensor(indices) + args : (Tensor x, Tensor values, Tensor indices, Tensor values_grad, Scalar dim, bool keepdim) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : min_with_index_grad + - backward_op : mish_grad forward : mish (Tensor x, float lambda) -> Tensor(out) args : (Tensor x, Tensor out_grad, float lambda) diff --git a/paddle/phi/ops/yaml/fused_ops.yaml b/paddle/phi/ops/yaml/fused_ops.yaml index 291147c33367bf..991b1ab8c0ab6d 100644 --- a/paddle/phi/ops/yaml/fused_ops.yaml +++ b/paddle/phi/ops/yaml/fused_ops.yaml @@ -916,12 +916,13 @@ support_dygraph_mode : true - op: fused_transpose_split_quant - args: (Tensor x, IntArray tokens_per_expert, bool pow_2_scales=false) + args: (Tensor x, Tensor input_scales, IntArray tokens_per_expert, bool pow_2_scales=false) output: Tensor[](out){tokens_per_expert.size()}, Tensor[](scales){tokens_per_expert.size()} infer_meta: func: FusedTransposeSplitQuantInferMeta kernel: func: fused_transpose_split_quant + optional: input_scales support_dygraph_mode : true - op: fused_weighted_swiglu_act_quant diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 590055b43b9ba6..78f836e842cc3c 100644 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -3553,6 +3553,17 @@ backward : max_pool3d_with_index_grad interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : max_with_index + args : (Tensor x, Scalar(int64_t) dim, bool keepdim = false, bool flatten = false) + output : Tensor(values), Tensor(indices) + infer_meta : + func : MinMaxWithIndexInferMeta + kernel : + func : max_with_index + data_type : x + backward : max_with_index_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface, paddle::dialect::LayoutTransformationInterface + - op : maxout args : (Tensor x, int groups, int axis = 1) output : Tensor(out) @@ -3662,6 +3673,17 @@ backward : meshgrid_grad interfaces : paddle::dialect::InferSymbolicShapeInterface +- op : min_with_index + args : (Tensor x, Scalar(int64_t) dim, bool keepdim = false, bool flatten = false) + output : Tensor(values), Tensor(indices) + infer_meta : + func : MinMaxWithIndexInferMeta + kernel : + func : min_with_index + data_type : x + backward : min_with_index_grad + interfaces : paddle::dialect::InferSymbolicShapeInterface, paddle::dialect::LayoutTransformationInterface + - op : mish args : (Tensor x, float lambda) output : Tensor diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 4ebc15fdc9753c..53680e172adcd6 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -122,6 +122,7 @@ _pir_ops as _pir_ops, _typing as _typing, callbacks as callbacks, + compat as compat, fft as fft, hub as hub, linalg as linalg, diff --git a/python/paddle/compat.py b/python/paddle/compat.py new file mode 100644 index 00000000000000..1eef54a83fd2b8 --- /dev/null +++ b/python/paddle/compat.py @@ -0,0 +1,25 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .tensor.compat import ( + max, + min, + split, +) + +__all__ = [ + 'split', + 'min', + 'max', +] diff --git a/python/paddle/incubate/nn/functional/fp8.py index 7c524b865ee96b..be61e7bdb72ae3 100644 --- a/python/paddle/incubate/nn/functional/fp8.py +++ b/python/paddle/incubate/nn/functional/fp8.py @@ -173,7 +173,9 @@ def fused_swiglu_weighted_bwd( return _C_ops.fused_swiglu_weighted_bwd(o1, do2_s, unzipped_probs) -def fused_transpose_split_quant(x, tokens_per_expert, pow_2_scales=False): +def fused_transpose_split_quant( + x, input_scales, tokens_per_expert, pow_2_scales=False +): """ Applies fused transpose, split, and quantization operation for Mixture of Experts (MoE) models. @@ -215,7 +217,7 @@ def fused_transpose_split_quant(x, tokens_per_expert, pow_2_scales=False): >>> x = paddle.randn([384, 512], dtype='bfloat16') >>> x = paddle.clip(x, min=-50, max=50) >>> tokens_per_expert = [128, 128, 128] - >>> outs, scales = F.fused_transpose_split_quant(x, tokens_per_expert, pow_2_scales=True) + >>> outs, scales = F.fused_transpose_split_quant(x, None, tokens_per_expert, pow_2_scales=True) >>> print(outs[0].shape) [512, 128] >>> print(scales[0].shape) @@ -228,7 +230,7 @@ def fused_transpose_split_quant(x, tokens_per_expert, pow_2_scales=False): if in_dynamic_or_pir_mode(): return _C_ops.fused_transpose_split_quant( - x, tokens_per_expert, pow_2_scales + x, input_scales, tokens_per_expert, pow_2_scales ) diff --git a/python/paddle/nn/clip.py index b3fe014b27a350..0d650d8fed519e 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -717,6 +717,7 @@ def _dygraph_clip(self, params_grads): sum_square_list = [] sum_square_list_fp16 = [] sum_square_list_fp32 = [] + flag_auto_hybrid_pp = True # Determine whether to use the new dynamic graph semi-automatic parallel pp framework if len(params_grads) > 0 and len(params_grads[0]) > 0: src_mesh = params_grads[0][0].process_mesh else: @@ -742,6 +743,7 @@ def _dygraph_clip(self, params_grads): # if the gradient mesh is not equal to src mesh # do reshard to get the result of squared_l2 from other pp stage mesh if src_mesh is not None and g.process_mesh != src_mesh: + flag_auto_hybrid_pp = False pp_mesh = get_complete_pp_mesh(g.process_mesh) if set(g.process_mesh.process_ids) < set(pp_mesh.process_ids): sum_square = dist.reshard( @@ -791,6 +793,37 @@ def async_add_n(var_list): global_norm_var = async_add_n(global_norm_var) + # NOTE(zhengtianyu): Fix grad_clip in auto_hybrid_pp mode. + # Reason: In auto_hybrid_pp mode, each rank only keeps local parameters and gradient information, + # so global_norm_var is in a partial state, leading to an incorrect result. + # Following the dynamic manual-parallel reference: each rank computes its local global_norm_var, + # then performs a reduce(sum) across the pp group to get the correct global_norm_var.
+ # For complete alignment with old dygraph semi-auto parallel PP logic, + # refer to NOTE: align ClipGradByGlobalNorm in auto_parallel_align_mode + if flag_auto_hybrid_pp and src_mesh is not None: + g_mesh = dist.get_mesh() + if ( + g_mesh + and "pp" in g_mesh.dim_names + and g_mesh.get_dim_size("pp") > 1 + ): + # Get the pipeline parallelism subgroup for communication + pp_group = g_mesh.get_submesh_with_dim("pp").get_group("pp") + + # Perform all-reduce on the local tensor value across the PP group + global_norm_var_local = global_norm_var._local_value() + dist.all_reduce( + global_norm_var_local, + op=dist.ReduceOp.SUM, + group=pp_group, + ) + + global_norm_var = dist.shard_tensor( + global_norm_var_local, + global_norm_var.process_mesh, + global_norm_var.placements, + ) + if self.should_comm_on_shard_dim and hasattr(self, 'sharding_group'): paddle.distributed.all_reduce( global_norm_var._local_value(), group=self.sharding_group diff --git a/python/paddle/static/quantization/quant_int8_onednn_pass.py b/python/paddle/static/quantization/quant_int8_onednn_pass.py index 2387e8bd9b70f7..909a94427c9718 100644 --- a/python/paddle/static/quantization/quant_int8_onednn_pass.py +++ b/python/paddle/static/quantization/quant_int8_onednn_pass.py @@ -177,7 +177,7 @@ def _transform_to_conv_onednn(self, graph, op_node): conv_op_node.set_attr("Scale_weights", scale_w) conv_op_node.set_attr("Scale_in", scale_in) conv_op_node.set_attr("Scale_out", 1.0) - conv_op_node.set_attr("use_mkldnn", 1) + conv_op_node.set_attr("use_onednn", 1) conv_op_node.set_attr("force_fp32_output", 1) graph.link_to(input_var_node, conv_op_node) graph.link_to(weight_var_node, conv_op_node) @@ -223,7 +223,7 @@ def _transform_to_mul_onednn(self, graph, op_node): mul_op_node.set_attr("scale_y", scale_w) mul_op_node.set_attr("scale_x", scale_in) mul_op_node.set_attr("scale_out", 1.0) - mul_op_node.set_attr("use_mkldnn", 1) + mul_op_node.set_attr("use_onednn", 1) mul_op_node.set_attr("force_fp32_output", 1) graph.link_to(input_var_node, mul_op_node) graph.link_to(weight_var_node, mul_op_node) @@ -248,7 +248,7 @@ def _transform_to_quantize_onednn(self, graph, op_node): op_type='quantize', attrs={ 'data_format': 'ONEDNNLAYOUT', - 'use_mkldnn': 1, + 'use_onednn': 1, 'Scale': scale_in, 'is_negative_input': 1, }, diff --git a/python/paddle/tensor/compat.py b/python/paddle/tensor/compat.py new file mode 100644 index 00000000000000..e734023a11d96b --- /dev/null +++ b/python/paddle/tensor/compat.py @@ -0,0 +1,571 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
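The `ClipGradByGlobalNorm` fix in `python/paddle/nn/clip.py` above rests on one observation: under auto_hybrid_pp each rank holds only its own stage's parameters and gradients, so the locally accumulated squared norm is partial until it is summed across the pp group. A minimal sketch of that reduction, reusing the same mesh/group calls the patch itself uses (`local_sq_norm` is a hypothetical per-rank partial value, not a name from the patch):

```python
import paddle
import paddle.distributed as dist

def pp_global_norm(local_sq_norm):
    """Sum per-stage squared norms across the pp group, then take the root."""
    g_mesh = dist.get_mesh()
    if g_mesh and "pp" in g_mesh.dim_names and g_mesh.get_dim_size("pp") > 1:
        pp_group = g_mesh.get_submesh_with_dim("pp").get_group("pp")
        # each pp stage contributes only the norm of its local gradients
        dist.all_reduce(local_sq_norm, op=dist.ReduceOp.SUM, group=pp_group)
    return paddle.sqrt(local_sq_norm)
```

The patch applies this reduction to `global_norm_var._local_value()` and then re-wraps the result with `dist.shard_tensor`, so downstream code still sees a tensor with the original mesh and placements.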
+ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, NamedTuple + +import paddle +from paddle import _C_ops + +from ..base.framework import Variable +from ..framework import ( + in_dynamic_mode, +) + +if TYPE_CHECKING: + from collections.abc import Sequence + + from paddle import Tensor + +from paddle.utils.decorator_utils import ForbidKeywordsDecorator + +__all__ = [] + + +@ForbidKeywordsDecorator( + illegal_keys=["x", "num_or_sections", "axis", "name"], + func_name="paddle.compat.split", + correct_name="paddle.split", +) +def split( + tensor: Tensor, split_size_or_sections: int | Sequence[int], dim: int = 0 +) -> tuple[Tensor, ...]: + """ + (PyTorch Compatible API) Split the input tensor into multiple sub-Tensors. + + Args: + tensor (Tensor): An N-D Tensor. The data type is bool, bfloat16, float16, float32, float64, uint8, int8, int32 or int64. + split_size_or_sections (int|list|tuple): + If split_size_or_sections is an integer type, then tensor will be split into equally sized chunks (if possible). + The last chunk will be smaller if the tensor size along the given dimension dim is not divisible by split_size. + If split_size_or_sections is a list, then tensor will be split into len(split_size_or_sections) chunks with sizes + in dim according to split_size_or_sections. Negative inputs are not allowed. For example: for a dim with 9 channels, + [2, 3, -1] will not be interpreted as [2, 3, 4], but will be rejected and an exception will be thrown. + dim (int|Tensor, optional): The dim along which to split; it can be an integer or a ``0-D Tensor`` + with shape [] and data type ``int32`` or ``int64``. + If :math:`dim < 0`, the dim to split along is :math:`rank(x) + dim`. Default is 0. + Returns: + tuple(Tensor), The tuple of segmented Tensors. + + Note: + This is a PyTorch-compatible API that follows the function signature and behavior of ``torch.split``. + To use the original split of Paddle, please use `paddle.split` instead. + + Examples:
+ .. code-block:: python + + >>> import paddle + + >>> # x is a Tensor of shape [3, 8, 5] + >>> x = paddle.rand([3, 8, 5]) + + >>> out0, out1, out2 = paddle.compat.split(x, split_size_or_sections=3, dim=1) + >>> print(out0.shape) + [3, 3, 5] + >>> print(out1.shape) + [3, 3, 5] + >>> print(out2.shape) + [3, 2, 5] + + >>> out0, out1, out2 = paddle.compat.split(x, split_size_or_sections=[1, 2, 5], dim=1) + >>> print(out0.shape) + [3, 1, 5] + >>> print(out1.shape) + [3, 2, 5] + >>> print(out2.shape) + [3, 5, 5] + + >>> # dim is negative, the real dim is (rank(x) + dim)=1 + >>> out0, out1, out2 = paddle.compat.split(x, split_size_or_sections=3, dim=-2) + >>> print(out0.shape) + [3, 3, 5] + >>> print(out1.shape) + [3, 3, 5] + >>> print(out2.shape) + [3, 2, 5] + """ + + def GetSplitSize(split_size, shape_on_dim): + # number of complete chunks of `split_size`; the last chunk keeps the remainder + remaining_num = shape_on_dim % split_size + num_complete_section = shape_on_dim // split_size + if remaining_num == 0: + return num_complete_section + else: + sections = [split_size for _ in range(num_complete_section)] + sections.append(remaining_num) + return sections + + def GetShapeOnDimInRange(shape, dim: int) -> int: + shape_range = len(shape) + if isinstance(dim, int): + if dim < -shape_range or dim >= shape_range: + raise ValueError( + f"(InvalidArgument) The dim is expected to be in range of [-{shape_range}, {shape_range}), but got {dim}" + ) + return shape[dim] + + if isinstance(split_size_or_sections, (list, tuple)): + for i, section_size in enumerate(split_size_or_sections): + if isinstance(section_size, Variable): + shape_val = int(section_size.item(0)) + else: + shape_val = section_size + if shape_val < 0: + raise ValueError( + f"paddle.compat.split expects split_sizes to have only non-negative entries, but got size = {shape_val} on dim {i}" + ) + + if in_dynamic_mode(): + if isinstance(dim, Variable): + dim = dim.item(0) + assert dim + len(tensor.shape) >= 0, "(rank(x) + dim) must >= 0" + dim = (dim + len(tensor.shape)) if dim < 0 else dim + + if isinstance(split_size_or_sections, (list, tuple)): + if paddle.utils._contain_var(split_size_or_sections): + for index, item in enumerate(split_size_or_sections): + if isinstance(item, Variable): + split_size_or_sections[index] = split_size_or_sections[ + index + ].item() + elif not isinstance(split_size_or_sections, int): + raise TypeError( + "The type of 'split_size_or_sections' in split must be int, list or tuple in imperative mode, but " + f"received {type(split_size_or_sections)}." + ) + + if isinstance(split_size_or_sections, int): + # compute the per-chunk sections for the equal-split case + assert ( + split_size_or_sections > 0 + ), 'split_size_or_sections must be greater than 0.' + + split_size_or_sections = GetSplitSize( + split_size_or_sections, GetShapeOnDimInRange(tensor.shape, dim) + ) + + if isinstance(split_size_or_sections, list): + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) + else: + return tuple( + _C_ops.split_with_num(tensor, split_size_or_sections, dim) + ) + else: + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) + else: + if isinstance(dim, paddle.pir.Value): + raise TypeError( + "'dim' is not allowed to be a pir.Value in a static graph: " + "\npir.Value can not be used for indexing python lists/tuples." + ) + if isinstance(dim, int): + assert len(tensor.shape) + dim >= 0, "(rank(x) + dim) must >= 0" + dim = (len(tensor.shape) + dim) if dim < 0 else dim + + input_shape = tensor.shape + + if not isinstance(split_size_or_sections, (int, list, tuple)): + raise TypeError( + "The type of 'split_size_or_sections' in split must be int, list or tuple in static graph mode." + ) + if isinstance(split_size_or_sections, int): + assert ( + split_size_or_sections > 0 + ), 'split_size_or_sections must be greater than 0.' + + split_size_or_sections = GetSplitSize( + split_size_or_sections, GetShapeOnDimInRange(tensor.shape, dim) + ) + if isinstance(split_size_or_sections, list): + if paddle.utils._contain_var(split_size_or_sections): + split_size_or_sections = paddle.utils.get_int_tensor_list( + split_size_or_sections + ) + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) + else: + return tuple( + _C_ops.split_with_num(tensor, split_size_or_sections, dim) + ) + else: + if isinstance(dim, int) and input_shape[dim] > 0: + assert ( + len(split_size_or_sections) <= input_shape[dim] + ), 'len(split_size_or_sections) must not be more than input.shape[dim].' + if paddle.utils._contain_var(split_size_or_sections): + split_size_or_sections = paddle.utils.get_int_tensor_list( + split_size_or_sections + ) + return tuple(_C_ops.split(tensor, split_size_or_sections, dim))
+ + +class MinMaxRetType(NamedTuple): + values: Tensor + indices: Tensor + + +def _min_max_param_checker(func_name: str, *args: Any, **kwargs: Any): + def invalid_arguments_exception(error_prefix=""): + type_strs = [type(v).__name__ for v in args] + type_strs.extend([f"{k}={type(v).__name__}" for k, v in kwargs.items()]) + signature = ", ".join(type_strs) + + error_msg = ( + f"Invalid arguments for `paddle.compat.{func_name}`:\n{error_prefix}" + f"Got: (paddle.Tensor input, {signature}), but expected one of:\n" + f" - (input: paddle.Tensor) for reduce_{func_name} on all dims.\n" + f" - (input: paddle.Tensor, other: paddle.Tensor) -> see paddle.{func_name}imum\n" + f" - (input: paddle.Tensor, int dim (cannot be None), bool keepdim = False)\n" + ) + return TypeError(error_msg) + + def try_get_keys(key): + res = None + try: + res = kwargs[key] + except KeyError: + raise invalid_arguments_exception() from None + return res + + dim_or_other = None + keepdim = False + + num_args = len(args) + total_arg_num = num_args + len(kwargs) + if total_arg_num > 2: + raise invalid_arguments_exception() + elif total_arg_num == 2: + if num_args == 2: + dim_or_other, keepdim = args + if dim_or_other is None or isinstance( + dim_or_other, (Variable, paddle.pir.Value) + ): + raise invalid_arguments_exception() + elif num_args == 1: + dim_or_other = args[0] + if dim_or_other is None or isinstance( + dim_or_other, (Variable, paddle.pir.Value) + ): + raise invalid_arguments_exception() + keepdim = try_get_keys("keepdim") + else: + dim_or_other = try_get_keys("dim") + keepdim = try_get_keys("keepdim") + elif total_arg_num == 1: + if num_args: + dim_or_other = args[0] + if dim_or_other is None: + raise invalid_arguments_exception() + else: + if "dim" in kwargs: + dim_or_other = kwargs["dim"] + elif "other" in kwargs: + dim_or_other = kwargs["other"] + if not isinstance(dim_or_other, (Variable, paddle.pir.Value)): + raise invalid_arguments_exception() + if dim_or_other is None: + raise invalid_arguments_exception() + + if ( + dim_or_other is not None + and not isinstance(dim_or_other, (Variable, paddle.pir.Value)) + and type(dim_or_other) is not int + ): + raise invalid_arguments_exception( + f"The second input must be int or Tensor or implicit None in compat.{func_name}, but received {type(dim_or_other)}.\n" + ) + + return dim_or_other, keepdim
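The chunking rule described in the docstring above (equal chunks with a smaller trailing chunk, negative sections rejected outright) is easy to state in pure Python. A minimal self-contained sketch; `sections_for` is a hypothetical helper for illustration, not part of this patch:

```python
def sections_for(length, split_size_or_sections):
    """Section sizes paddle.compat.split would produce along one dim."""
    if isinstance(split_size_or_sections, int):
        full, rem = divmod(length, split_size_or_sections)
        sections = [split_size_or_sections] * full
        if rem:
            sections.append(rem)  # trailing chunk keeps the remainder
        return sections
    if any(s < 0 for s in split_size_or_sections):
        raise ValueError("negative sections are rejected, never inferred")
    return list(split_size_or_sections)

assert sections_for(8, 3) == [3, 3, 2]          # mirrors the dim=1 example
assert sections_for(8, [1, 2, 5]) == [1, 2, 5]  # explicit sections pass through
```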
+ + +def _min_max_tensor_allow_grad(input: Tensor): + """Prevent integral input tensors from having `stop_gradient=False`""" + in_dtype = input.dtype + if ( + in_dtype == paddle.int32 + or in_dtype == paddle.int64 + or in_dtype == paddle.uint8 + or in_dtype == paddle.int16 + ): + if not input.stop_gradient: + raise TypeError( + f"Tensors with integral type: '{in_dtype}' should stop gradient." + ) + + +def _min_max_allow_cpu_composite(input: Tensor): + """paddle.min/argmin (max/argmax) and paddle.take_along_axis reject the following types""" + in_dtype = input.dtype + if ( + in_dtype == paddle.float16 + or in_dtype == paddle.bfloat16 + or in_dtype == paddle.int16 + ): + raise TypeError( + f"Non-CUDA GPU placed Tensor does not have '{in_dtype}' op registered.\n" + "Paddle supports the following DataTypes: int32, int64, float64, float32, uint8" + ) + + +@ForbidKeywordsDecorator( + illegal_keys=['x', 'axis'], + func_name="paddle.compat.min", + correct_name='paddle.min', +) +def min(input: Tensor, *args: Any, **kwargs: Any) -> Tensor | MinMaxRetType: + """ + + Computes the minimum of tensor elements. There are mainly 3 cases (functionalities): + 1. paddle.compat.min(input: Tensor): reduce min over all dims, return a single value Tensor + 2. paddle.compat.min(input: Tensor, dim: int (cannot be None), keepdim=False): reduce min over the given dim, + returns a named tuple MinMaxRetType(values: Tensor, indices: Tensor) + 3. paddle.compat.min(input: Tensor, other: Tensor): see `paddle.minimum` + + Special warning: the gradient behavior is NOT well-documented by PyTorch; the actual behavior is: + 1. Case 1: the same as `amin` + 2. Case 2: NOT evenly distributing the gradient for equal minimum elements! PyTorch actually only propagates the gradient to the elements selected by `indices`, + for example: Tensor([1, 1, 1]) -> min(..., dim=0) -> values=Tensor(1), indices=Tensor(0); the gradient for the input tensor won't be + Tensor([1/3, 1/3, 1/3]) as stated in their documentation, but will be Tensor([1, 0, 0]). This API implements a similar backward kernel. + 3. Case 3: the same as `minimum` + + Args: + input (Tensor): A tensor; the data type is bfloat16, float16, float32, float64, int32, int64 on GPU. + uint8, int32, int64, float32, float64 are allowed on CPU. + dim (int, optional): The dim along which the minimum is computed. + If this is not specified (case 1; note that explicitly passing `None` will throw a TypeError), + the minimum is computed over all elements of `input` and a Tensor with a single element is returned; + otherwise it must be in the range :math:`[-input.ndim, input.ndim)`. + If :math:`dim < 0`, the axis to reduce is :math:`input.ndim + dim`. + Warning: if `dim` is specified, executing a static graph will throw exceptions + when not on a GPU device, since min_with_index is not implemented for non-GPU devices + keepdim (bool, optional): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the `input` unless :attr:`keepdim` is true, default + value is False. Note that if `dim` appears in neither (*args) nor (**kwargs), this parameter cannot be passed alone + other (Tensor, optional): the other tensor to perform `paddle.minimum` with. This Tensor should + have the same or broadcastable shape as the `input`. Note that (`dim` & `keepdim`) and `other` are mutually exclusive,
+ meaning that trying to combine both will result in a TypeError + + Returns: + - For case 1: a single value Tensor (0-dim) + - For case 2: a named tuple MinMaxRetType(values: Tensor, indices: Tensor), where `values` has the same data type as the `input`, + while indices is always an int64 Tensor, with exactly the same shape as `values`. + MinMaxRetType can be used (indexed, packed, unpacked) in the same way as a regular tuple + - For case 3: see `paddle.minimum` + + + Examples: + .. code-block:: python + + >>> import paddle + + >>> # data_x is a Tensor with shape [2, 4] + >>> # the axis is an int element + >>> x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + ... [0.1, 0.2, 0.6, 0.7]], + ... dtype='float64', stop_gradient=False) + >>> # Case 1: reduce over all dims + >>> result1 = paddle.compat.min(x) + >>> result1 + Tensor(shape=[], dtype=float64, place=Place(gpu:0), stop_gradient=False, + 0.10000000) + + >>> # Case 2: reduce over specified dim + >>> x.clear_grad() + >>> result2 = paddle.compat.min(x, dim=1) + >>> result2 + MinMaxRetType(values=Tensor(shape=[2], dtype=float64, place=Place(gpu:0), stop_gradient=False, + [0.20000000, 0.10000000]), indices=Tensor(shape=[2], dtype=int64, place=Place(gpu:0), stop_gradient=True, + [0, 0])) + >>> result2[0].backward() + >>> x.grad + Tensor(shape=[2, 4], dtype=float64, place=Place(gpu:0), stop_gradient=False, + [[1., 0., 0., 0.], + [1., 0., 0., 0.]]) + + >>> # Case 3: equivalent to `paddle.minimum` + >>> x.clear_grad() + >>> y = paddle.to_tensor([[0.5, 0.4, 0.1, 0.2], + ... [0.3, 0.1, 0.6, 0.7]], + ... dtype='float64', stop_gradient=False) + >>> result3 = paddle.compat.min(x, y) + >>> result3 + Tensor(shape=[2, 4], dtype=float64, place=Place(gpu:0), stop_gradient=False, + [[0.20000000, 0.30000000, 0.10000000, 0.20000000], + [0.10000000, 0.10000000, 0.60000000, 0.70000000]]) + """ + if not isinstance(input, paddle.pir.Value) and not isinstance( + input, paddle.Tensor + ): + raise TypeError( + f"input should be a tensor, but got an instance with type '{type(input).__name__}'" + ) + _min_max_tensor_allow_grad(input) + + dim_or_other, keepdim = _min_max_param_checker("min", *args, **kwargs) + + if dim_or_other is None: + if input.numel() == 0: + raise ValueError( + "Reduce min cannot apply on empty tensor (numel == 0)" + ) + return paddle.amin(input) + elif isinstance(dim_or_other, int): + if in_dynamic_mode() and not input.place.is_gpu_place(): + _min_max_allow_cpu_composite(input) + # CPUPlace and other placements are implemented by composition + indices = paddle.argmin(input, axis=dim_or_other, keepdim=True) + values = paddle.take_along_axis(input, indices, axis=dim_or_other) + if keepdim: + return MinMaxRetType(values=values, indices=indices) + return MinMaxRetType( + values=values.squeeze_(axis=dim_or_other), + indices=indices.squeeze_(axis=dim_or_other), + ) + else: + vals, inds = _C_ops.min_with_index( + input, dim_or_other, keepdim, False + ) + inds.stop_gradient = True + return MinMaxRetType(values=vals, indices=inds) + else: + return _C_ops.minimum(input, dim_or_other) + + +@ForbidKeywordsDecorator( + illegal_keys=['x', 'axis'], + func_name="paddle.compat.max", + correct_name='paddle.max', +) +def max(input: Tensor, *args: Any, **kwargs: Any) -> Tensor | MinMaxRetType: + """ + + Computes the maximum of tensor elements. There are mainly 3 cases (functionalities): + 1. paddle.compat.max(input: Tensor): reduce max over all dims, return a single value Tensor
+ 2. paddle.compat.max(input: Tensor, dim: int (cannot be None), keepdim=False): reduce max over the given dim, + returns a named tuple MinMaxRetType(values: Tensor, indices: Tensor) + 3. paddle.compat.max(input: Tensor, other: Tensor): see `paddle.maximum` + + Special warning: the gradient behavior is NOT well-documented by PyTorch; the actual behavior is: + 1. Case 1: the same as `amax` + 2. Case 2: NOT evenly distributing the gradient for equal maximum elements! PyTorch actually only propagates the gradient to the elements selected by `indices`, + for example: Tensor([1, 1, 1]) -> max(..., dim=0) -> values=Tensor(1), indices=Tensor(0); the gradient for the input tensor won't be + Tensor([1/3, 1/3, 1/3]) as stated in their documentation, but will be Tensor([1, 0, 0]). This API implements a similar backward kernel. + 3. Case 3: the same as `maximum` + + Args: + input (Tensor): A tensor; the data type is bfloat16, float16, float32, float64, int32, int64 on GPU. + uint8, int32, int64, float32, float64 are allowed on CPU. + dim (int, optional): The dim along which the maximum is computed. + If this is not specified (case 1; note that explicitly passing `None` will throw a TypeError), + the maximum is computed over all elements of `input` and a Tensor with a single element is returned; + otherwise it must be in the range :math:`[-input.ndim, input.ndim)`. + If :math:`dim < 0`, the axis to reduce is :math:`input.ndim + dim`. + Warning: if `dim` is specified, executing a static graph will throw exceptions + when not on a GPU device, since max_with_index is not implemented for non-GPU devices + keepdim (bool, optional): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the `input` unless :attr:`keepdim` is true, default + value is False. Note that if `dim` appears in neither (*args) nor (**kwargs), this parameter cannot be passed alone + other (Tensor, optional): the other tensor to perform `paddle.maximum` with. This Tensor should + have the same or broadcastable shape as the `input`. Note that (`dim` & `keepdim`) and `other` are mutually exclusive, + meaning that trying to combine both will result in a TypeError + + Returns: + - For case 1: a single value Tensor (0-dim) + - For case 2: a named tuple MinMaxRetType(values: Tensor, indices: Tensor), where `values` has the same data type as the `input`, + while indices is always an int64 Tensor, with exactly the same shape as `values`. + MinMaxRetType can be used (indexed, packed, unpacked) in the same way as a regular tuple + - For case 3: see `paddle.maximum` + + + Examples: + .. code-block:: python + + >>> import paddle + + >>> # data_x is a Tensor with shape [2, 4] + >>> # the axis is an int element + >>> x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + ... [0.1, 0.2, 0.6, 0.7]], + ...
dtype='float64', stop_gradient=False) + >>> # Case 1: reduce over all dims + >>> result1 = paddle.compat.max(x) + >>> result1 + Tensor(shape=[], dtype=float64, place=Place(gpu:0), stop_gradient=False, + 0.90000000) + + >>> # Case 2: reduce over specified dim + >>> x.clear_grad() + >>> result2 = paddle.compat.max(x, dim=1) + >>> result2 + MinMaxRetType(values=Tensor(shape=[2], dtype=float64, place=Place(gpu:0), stop_gradient=False, + [0.90000000, 0.70000000]), indices=Tensor(shape=[2], dtype=int64, place=Place(gpu:0), stop_gradient=True, + [3, 3])) + >>> result2[0].backward() + >>> x.grad + Tensor(shape=[2, 4], dtype=float64, place=Place(gpu:0), stop_gradient=False, + [[0., 0., 0., 1.], + [0., 0., 0., 1.]]) + + >>> # Case 3: equivalent to `paddle.maximum` + >>> x.clear_grad() + >>> y = paddle.to_tensor([[0.5, 0.4, 0.1, 0.2], + ... [0.3, 0.1, 0.6, 0.7]], + ... dtype='float64', stop_gradient=False) + >>> result3 = paddle.compat.max(x, y) + >>> result3 + Tensor(shape=[2, 4], dtype=float64, place=Place(gpu:0), stop_gradient=False, + [[0.50000000, 0.40000000, 0.50000000, 0.90000000], + [0.30000000, 0.20000000, 0.60000000, 0.70000000]]) + """ + if not isinstance(input, paddle.pir.Value) and not isinstance( + input, paddle.Tensor + ): + raise TypeError( + f"input should be a tensor, but got an instance with type '{type(input).__name__}'" + ) + _min_max_tensor_allow_grad(input) + + dim_or_other, keepdim = _min_max_param_checker("max", *args, **kwargs) + + if dim_or_other is None: + if input.numel() == 0: + raise ValueError( + "Reduce max cannot apply on empty tensor (numel == 0)" + ) + return paddle.amax(input) + elif isinstance(dim_or_other, int): + if in_dynamic_mode() and not input.place.is_gpu_place(): + _min_max_allow_cpu_composite(input) + indices = paddle.argmax(input, axis=dim_or_other, keepdim=True) + values = paddle.take_along_axis(input, indices, axis=dim_or_other) + if keepdim: + return MinMaxRetType(values=values, indices=indices) + return MinMaxRetType( + values=values.squeeze_(axis=dim_or_other), + indices=indices.squeeze_(axis=dim_or_other), + ) + else: + vals, inds = _C_ops.max_with_index( + input, dim_or_other, keepdim, False + ) + inds.stop_gradient = True + return MinMaxRetType(values=vals, indices=inds) + else: + return _C_ops.maximum(input, dim_or_other) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index cb9b300b6d624f..55432ea9adcbaa 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -24,7 +24,7 @@ import paddle from paddle import _C_ops -from paddle.utils.decorator_utils import ParamAliasDecorator +from paddle.utils.decorator_utils import ParamAliasDecorator, SizeArgsDecorator from paddle.utils.inplace_utils import inplace_apis_in_dygraph_only from ..base.data_feeder import ( @@ -1241,6 +1241,7 @@ def fill_constant( return out +@SizeArgsDecorator() def ones( shape: ShapeLike, dtype: DTypeLike | None = None, name: str | None = None ) -> paddle.Tensor: @@ -3032,7 +3033,7 @@ def _memcpy(input, place=None, output=None) -> paddle.Tensor: def complex( - real: paddle.Tensor, imag: paddle.Tensor, name: str | None = None + real: paddle.Tensor, imag: paddle.Tensor, out=None, name: str | None = None ) -> paddle.Tensor: """Return a complex tensor given the real and image component. @@ -3040,6 +3041,7 @@ def complex( real (Tensor): The real component. The data type should be 'float32' or 'float64'. imag (Tensor): The image component. The data type should be the same as ``real``. 
name(str|None, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. + out (Tensor|None, optional): The output tensor. Default: None. Returns: Tensor, The output tensor. The data type is 'complex64' or 'complex128', with the same precision as ``real`` and ``imag``. @@ -3062,7 +3064,7 @@ def complex( [(1+0j), (1+1j), (1+2j)]]) """ if in_dynamic_or_pir_mode(): - return _C_ops.complex(real, imag) + return _C_ops.complex(real, imag, out=out) else: check_variable_and_dtype( real, 'real', ['float32', 'float64'], 'complex' diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 857554b5dd1f2a..2014603dff6ca6 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -58,6 +58,8 @@ TensorOrTensors, ) +from paddle.utils.decorator_utils import ForbidKeywordsDecorator + __all__ = [] @@ -2723,6 +2725,11 @@ def row_stack(x: Sequence[Tensor], name: str | None = None) -> Tensor: return paddle.vstack(x, name=name) +@ForbidKeywordsDecorator( + illegal_keys=["tensor", "split_size_or_sections", "dim"], + func_name="paddle.split", + correct_name="paddle.compat.split", +) def split( x: Tensor, num_or_sections: int | Sequence[int], diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 4c065b4ab43c2b..868c9eb1c10173 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -100,6 +100,8 @@ from paddle import Tensor from paddle._typing import DTypeLike +from paddle.utils.decorator_utils import ForbidKeywordsDecorator + __all__ = [] _supported_int_dtype_ = [ @@ -3131,6 +3133,11 @@ def _check_input(x): return out +@ForbidKeywordsDecorator( + illegal_keys=["input", "dim", "other"], + func_name="paddle.max", + correct_name="paddle.compat.max", +) def max( x: Tensor, axis: int | Sequence[int] | None = None, @@ -3290,6 +3297,11 @@ def max( return out +@ForbidKeywordsDecorator( + illegal_keys=["input", "dim", "other"], + func_name="paddle.min", + correct_name="paddle.compat.min", +) def min( x: Tensor, axis: int | Sequence[int] | None = None, diff --git a/python/paddle/utils/decorator_utils.py b/python/paddle/utils/decorator_utils.py index 79ec73937ec8c1..97d1f4da603517 100644 --- a/python/paddle/utils/decorator_utils.py +++ b/python/paddle/utils/decorator_utils.py @@ -89,3 +89,56 @@ def process( f"Cannot specify both '{original}' and its alias '{alias}'" ) return args, processed_kwargs + + +# *size => shape decorator +class SizeArgsDecorator(DecoratorBase): + """ + Usage Example: + + paddle.ones(1, dtype=paddle.float32) + paddle.ones(1, 2, 3, dtype=paddle.float32) + paddle.ones([1, 2, 3], dtype=paddle.float32) + paddle.ones(size=[1, 2, 3], dtype=paddle.float32) + + paddle.ones([1, 2, 3], paddle.float32) + paddle.ones(shape=[1, 2, 3], dtype=paddle.float32) + """ + + def process( + self, args: tuple[Any, ...], kwargs: dict[str, Any] + ) -> tuple[tuple[Any, ...], dict[str, Any]]: + if 'size' in kwargs: + kwargs['shape'] = kwargs.pop('size') + elif len(args) >= 1 and isinstance(args[0], int): + kwargs['shape'] = list(args) + args = () + + return args, kwargs + + +class ForbidKeywordsDecorator(DecoratorBase): + """A decorator that hints users to use the correct `compat` functions, when erroneous keyword arguments are detected""" + + def __init__( + self, illegal_keys: list[str], func_name: str, correct_name: str + ) -> None: + super().__init__() + self.illegal_keys = illegal_keys + self.func_name = func_name + self.correct_name = correct_name + 
+ def process( + self, args: tuple[Any, ...], kwargs: dict[str, Any] + ) -> tuple[tuple[Any, ...], dict[str, Any]]: + found_keys = [key for key in self.illegal_keys if key in kwargs] + + if found_keys: + keys_str = ", ".join(f"'{key}'" for key in found_keys) + plural = "s" if len(found_keys) > 1 else "" + + raise TypeError( + f"{self.func_name}() received unexpected keyword argument{plural} {keys_str}. " + f"\nDid you mean to use {self.correct_name}() instead?" + ) + return args, kwargs diff --git a/test/auto_parallel/PP_Schedules_demo.py b/test/auto_parallel/PP_Schedules_demo.py index 6ac055410fbf0a..be8963356d0661 100644 --- a/test/auto_parallel/PP_Schedules_demo.py +++ b/test/auto_parallel/PP_Schedules_demo.py @@ -414,6 +414,67 @@ def test_dp_pp(self): opt.clear_grad() return losses_by_step, all_losses_in_one_step_md5sum + def test_pp_model_with_ClipGradByGlobalNorm(self): + """Test pipeline parallel model with ClipGradByGlobalNorm using PPMyModel as the baseline""" + fix_seeds() + pp_model = PPMyModel() + opt = paddle.optimizer.AdamW( + learning_rate=0.001, + parameters=pp_model.parameters(), + grad_clip=paddle.nn.ClipGradByGlobalNorm(1.0), + ) + loss_fn = nn.MSELoss() + dataset = RandomDataset(image_size=8, output_size=8, num_samples=8) + loader = DataLoader(dataset, batch_size=1) + pp_losses_step = [] + num_iterations = 20 + + for iter_idx in range(num_iterations): + pp_losses_micro_batch = [] + for i, (data, label) in enumerate(loader): + output = pp_model(data) + loss = loss_fn(output, label) + pp_losses_micro_batch.append(loss.item()) + loss.backward() + pp_losses_step.append( + np.array(pp_losses_micro_batch, dtype=np.float32).mean() + ) + opt.step() + opt.clear_grad() + return pp_losses_step + + def test_ScheduleFThenB_with_ClipGradByGlobalNorm(self): + fix_seeds() + self.model = PPMyModel_SingleStage() + self.micro_batches = 8 + self.stage = PipelineStage(self.model, self.rank, 4, group=self.group) + self.stage.has_backward = True + loss_fn_ = nn.MSELoss() + schedule = ScheduleFThenB( + self.stage, self.micro_batches, loss_fn=loss_fn_ + ) + opt = paddle.optimizer.AdamW( + learning_rate=0.001, + parameters=self.model.parameters(), + grad_clip=paddle.nn.ClipGradByGlobalNorm(1.0), + ) + dataset = RandomDataset(image_size=8, output_size=8, num_samples=8) + loader = DataLoader(dataset, batch_size=8) + losses_by_step = [] + num_iterations = 20 + + for iter_idx in range(num_iterations): + losses_by_micro_batch = [] + for i, (data, label) in enumerate(loader): + schedule.step(data, target=label, losses=losses_by_micro_batch) + if self.rank == 3: + losses_by_step.append( + np.array(losses_by_micro_batch, dtype=np.float32).mean() + ) + opt.step() + opt.clear_grad() + return losses_by_step + def test_dp_pp_align_mode(self): fix_seeds() paddle.set_flags({'FLAGS_enable_auto_parallel_align_mode': True}) @@ -490,6 +551,12 @@ def run_test(self): scheduleFThenB_losses = self.test_ScheduleFThenB() schedule1f1b_losses = self.test_Schedule1F1B() schedulevpp_losses = self.test_ScheduleVPP() + pp_model_with_ClipGradByGlobalNorm_losses = ( + self.test_pp_model_with_ClipGradByGlobalNorm() + ) + scheduleFThenB_with_ClipGradByGlobalNorm_losses = ( + self.test_ScheduleFThenB_with_ClipGradByGlobalNorm() + ) dp_pp_losses, dp_pp_losses_md5sum = self.test_dp_pp() dp_pp_align_mode_losses, dp_pp_align_mode_losses_md5sum = ( self.test_dp_pp_align_mode() @@ -520,6 +587,12 @@ def run_test(self): rtol=1e-5, ) + np.testing.assert_allclose( + pp_model_with_ClipGradByGlobalNorm_losses, + 
scheduleFThenB_with_ClipGradByGlobalNorm_losses, + rtol=1e-5, + ) + np.testing.assert_allclose( dp_pp_align_mode_losses, dp_pp_losses, diff --git a/test/cpp/cinn/common/integer_set_test.cc b/test/cpp/cinn/common/integer_set_test.cc index 6d57f2dd0ed257..3f7afd4bcae50d 100644 --- a/test/cpp/cinn/common/integer_set_test.cc +++ b/test/cpp/cinn/common/integer_set_test.cc @@ -24,11 +24,13 @@ namespace common { class TestSymbolicExprAnalyzer : public ::testing::Test { public: void SetUp() override { - i = ir::Var(ir::Expr(0), ir::Expr(7), "i"); - j = ir::Var(ir::Expr(0), ir::Expr(15), "j"); + // Var is [lower_bound, upper_bound) + i = ir::Var(ir::Expr(0), ir::Expr(7), "i"); // i ∈ [0, 7) + j = ir::Var(ir::Expr(0), ir::Expr(15), "j"); // j ∈ [0, 15) + // CasInterval is [lower_bound, upper_bound] var_intervals = { - {"i", CasInterval(i->lower_bound, i->upper_bound)}, - {"j", CasInterval(j->lower_bound, j->upper_bound)}, + {"i", CasInterval(i->lower_bound, i->upper_bound - 1)}, // i ∈ [0, 6] + {"j", CasInterval(j->lower_bound, j->upper_bound - 1)}, // j ∈ [0, 14] }; } @@ -41,35 +43,35 @@ class TestSymbolicExprAnalyzer : public ::testing::Test { TEST_F(TestSymbolicExprAnalyzer, bound) { ir::Expr e1 = i + j; EXPECT_EQ(analyzer.LowerBound(e1), ir::Expr(0)); - EXPECT_EQ(analyzer.UpperBound(e1), ir::Expr(22)); + EXPECT_EQ(analyzer.UpperBound(e1), ir::Expr(20)); // 6 + 14 = 20 ir::Expr e2 = 16 * i + j; EXPECT_EQ(analyzer.LowerBound(e2), ir::Expr(0)); - EXPECT_EQ(analyzer.UpperBound(e2), ir::Expr(127)); + EXPECT_EQ(analyzer.UpperBound(e2), ir::Expr(110)); // 16 * 6 + 14 = 110 ir::Expr e3 = 16 * i + j + 1; EXPECT_EQ(analyzer.LowerBound(e3), ir::Expr(1)); - EXPECT_EQ(analyzer.UpperBound(e3), ir::Expr(128)); + EXPECT_EQ(analyzer.UpperBound(e3), ir::Expr(111)); // 16 * 6 + 15 = 111 ir::Expr e4 = (16 * i + j) / 16; EXPECT_EQ(analyzer.LowerBound(e4), ir::Expr(0)); - EXPECT_EQ(analyzer.UpperBound(e4), ir::Expr(7)); + EXPECT_EQ(analyzer.UpperBound(e4), ir::Expr(6)); // 110 / 16 = 6 ir::Expr e5 = (16 * i + j) % 16; EXPECT_EQ(analyzer.LowerBound(e5), ir::Expr(0)); - EXPECT_EQ(analyzer.UpperBound(e5), ir::Expr(15)); + EXPECT_EQ(analyzer.UpperBound(e5), ir::Expr(14)); // 110 % 16 ir::Expr e6 = i - j; - EXPECT_EQ(analyzer.LowerBound(e6), ir::Expr(-15)); - EXPECT_EQ(analyzer.UpperBound(e6), ir::Expr(7)); + EXPECT_EQ(analyzer.LowerBound(e6), ir::Expr(-14)); // 0 - 14 + EXPECT_EQ(analyzer.UpperBound(e6), ir::Expr(6)); // 6 - 0 ir::Expr e7 = 0 - i - j; - EXPECT_EQ(analyzer.LowerBound(e7), ir::Expr(-22)); - EXPECT_EQ(analyzer.UpperBound(e7), ir::Expr(0)); + EXPECT_EQ(analyzer.LowerBound(e7), ir::Expr(-20)); // 0 - 6 - 14 + EXPECT_EQ(analyzer.UpperBound(e7), ir::Expr(0)); // 0 - 0 - 0 ir::Expr e8 = -1 * i - j; - EXPECT_EQ(analyzer.LowerBound(e8), ir::Expr(-22)); - EXPECT_EQ(analyzer.UpperBound(e8), ir::Expr(0)); + EXPECT_EQ(analyzer.LowerBound(e8), ir::Expr(-20)); // -1 * 6 - 14 + EXPECT_EQ(analyzer.UpperBound(e8), ir::Expr(0)); // -1 * 0 - 0 } TEST_F(TestSymbolicExprAnalyzer, compare) { @@ -142,9 +144,9 @@ TEST_F(TestSymbolicExprAnalyzer, Divisible) { auto S = ir::Var(ir::Expr(16), ir::Expr(256), "S"); cas_intervals_t divisible_var_intervals = { - {"x", CasInterval(x->lower_bound, x->upper_bound)}, - {"y", CasInterval(y->lower_bound, y->upper_bound)}, - {"S", CasInterval(S->lower_bound, S->upper_bound)}, + {"x", CasInterval(x->lower_bound, x->upper_bound - ir::Expr(1))}, + {"y", CasInterval(y->lower_bound, y->upper_bound - ir::Expr(1))}, + {"S", CasInterval(S->lower_bound, S->upper_bound - ir::Expr(1))}, }; 
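The `- 1` / `- ir::Expr(1)` adjustments running through these tests all encode one convention: `ir::Var` bounds are half-open, `[lower_bound, upper_bound)`, while `CasInterval` (and hence `NormalizeUpperBound`) is closed, `[lower, upper]`, so the inclusive maximum of a Var is `upper_bound - 1`. The updated expected values then follow by plain interval arithmetic; a quick check in Python:

```python
# i in [0, 7) and j in [0, 15)  ->  inclusive maxima 6 and 14
i_max, j_max = 7 - 1, 15 - 1

assert i_max + j_max == 20              # UpperBound(i + j)
assert 16 * i_max + j_max == 110        # UpperBound(16 * i + j)
assert 16 * i_max + j_max + 1 == 111    # UpperBound(16 * i + j + 1)
assert (16 * i_max + j_max) // 16 == 6  # UpperBound((16 * i + j) / 16)
assert -(i_max + j_max) == -20          # LowerBound(0 - i - j)
```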
SymbolicExprAnalyzer divisible_analyzer{divisible_var_intervals}; @@ -323,11 +325,11 @@ TEST(SingleIntervalIntSet, case_1) { } TEST(SingleIntervalIntSet, case_2) { - ir::Var S = ir::Var(ir::Expr(0), ir::Expr(0), "S"); + ir::Var S = ir::Var(ir::Expr(0), ir::Expr(1), "S"); // S ∈ [0, 1) - SingleIntervalIntSet set_0{S, S + Expr(1)}; - SingleIntervalIntSet set_1{Expr(0), Expr(1)}; - SingleIntervalIntSet set_2{Expr(0), Expr(2)}; + SingleIntervalIntSet set_0{S, S + Expr(1)}; // [0, 1] + SingleIntervalIntSet set_1{Expr(0), Expr(1)}; // [0, 1] + SingleIntervalIntSet set_2{Expr(0), Expr(2)}; // [0, 2] EXPECT_TRUE(ProveEQ(set_0, set_1).value()); EXPECT_FALSE(ProveEQ(set_0, set_2).value()); diff --git a/test/cpp/eager/performance_tests/benchmark_utils.cc b/test/cpp/eager/performance_tests/benchmark_utils.cc index 7b95d911bc5345..23218075517c4c 100644 --- a/test/cpp/eager/performance_tests/benchmark_utils.cc +++ b/test/cpp/eager/performance_tests/benchmark_utils.cc @@ -228,7 +228,7 @@ void benchmark_fluid_scale(const std::shared_ptr& X, imperative::Tracer tracer; framework::AttributeMap attrs; - attrs["use_mkldnn"] = false; + attrs["use_onednn"] = false; attrs["scale"] = 2; attrs["bias"] = 3; attrs["bias_after_scale"] = true; diff --git a/test/cpp/fluid/elementwise/test_elementwise_add_grad_grad.cc b/test/cpp/fluid/elementwise/test_elementwise_add_grad_grad.cc index 6186cfa2c9756f..ec00557d6a0dd5 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_add_grad_grad.cc +++ b/test/cpp/fluid/elementwise/test_elementwise_add_grad_grad.cc @@ -59,7 +59,7 @@ class TestElementwiseAddGradGradWithoutDDX this->op_type_, {{"Y", {"Y"}}, {"DOut", {"DOut"}}, {"DDY", {"DDY"}}}, {{"DDOut", {"DDOut"}}}, - {{"use_mkldnn", false}, {"axis", 0}}); + {{"use_onednn", false}, {"axis", 0}}); return op; } }; diff --git a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc index 8f1ed87888ba44..f4ecb943a8dd9c 100644 --- a/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc +++ b/test/cpp/fluid/elementwise/test_elementwise_div_grad_grad.cc @@ -87,7 +87,7 @@ class TestElementwiseDivGradGradWithDout : public TestElementwiseOpGradGrad { {"DDY", {"DDY"}}, {"DX", {"DX"}}}, {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}, {"DOut", {"DOut"}}}, - {{"use_mkldnn", false}, {"axis", 0}}); + {{"use_onednn", false}, {"axis", 0}}); return op; } }; diff --git a/test/cpp/fluid/mkldnn/test_conv_onednn_nhwc.cc b/test/cpp/fluid/mkldnn/test_conv_onednn_nhwc.cc index 28028858c3bac0..49071d5938a744 100644 --- a/test/cpp/fluid/mkldnn/test_conv_onednn_nhwc.cc +++ b/test/cpp/fluid/mkldnn/test_conv_onednn_nhwc.cc @@ -66,7 +66,7 @@ TEST(test_conv2d_output, fp32) { conv2d_op.SetAttr("paddings", paddings); conv2d_op.SetAttr("dilations", dilations); conv2d_op.SetAttr("groups", groups); - conv2d_op.SetAttr("use_mkldnn", true); + conv2d_op.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(conv2d_op); @@ -95,7 +95,7 @@ TEST(test_conv2d_output, int8) { conv2d_op.SetAttr("paddings", paddings); conv2d_op.SetAttr("dilations", dilations); conv2d_op.SetAttr("groups", groups); - conv2d_op.SetAttr("use_mkldnn", true); + conv2d_op.SetAttr("use_onednn", true); conv2d_op.SetAttr("mkldnn_data_type", std::string("int8")); conv2d_op.SetAttr("force_fp32_output", false); @@ -126,7 +126,7 @@ TEST(test_conv2d_output, ic1) { conv2d_op.SetAttr("paddings", paddings); conv2d_op.SetAttr("dilations", dilations); conv2d_op.SetAttr("groups", groups); - 
conv2d_op.SetAttr("use_mkldnn", true); + conv2d_op.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(conv2d_op); @@ -156,7 +156,7 @@ TEST(test_conv2d_output, ic2) { conv2d_op.SetAttr("paddings", paddings); conv2d_op.SetAttr("dilations", dilations); conv2d_op.SetAttr("groups", groups); - conv2d_op.SetAttr("use_mkldnn", true); + conv2d_op.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(conv2d_op); @@ -186,7 +186,7 @@ TEST(test_conv2d_output, ic4) { conv2d_op.SetAttr("paddings", paddings); conv2d_op.SetAttr("dilations", dilations); conv2d_op.SetAttr("groups", groups); - conv2d_op.SetAttr("use_mkldnn", true); + conv2d_op.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(conv2d_op); diff --git a/test/cpp/fluid/mkldnn/test_onednn_caching.cc b/test/cpp/fluid/mkldnn/test_onednn_caching.cc index 694d9aeb6e3bc7..d87e1c4145f5b2 100644 --- a/test/cpp/fluid/mkldnn/test_onednn_caching.cc +++ b/test/cpp/fluid/mkldnn/test_onednn_caching.cc @@ -115,12 +115,12 @@ void RunOperator(const phi::Place &place, {{first_input_var_name, {first_input}}, {second_input_var_name, {"x1"}}}, {{output_var_name, {output_name}}}, - {{"use_mkldnn", {true}}}) + {{"use_onednn", {true}}}) : framework::OpRegistry::CreateOp( op_type, {{first_input_var_name, {first_input}}}, {{output_var_name, {output_name}}}, - {{"use_mkldnn", {true}}}); + {{"use_onednn", {true}}}); op->Run(scope, place); pool.Get(place)->Wait(); diff --git a/test/cpp/fluid/mkldnn/test_onednn_conv2d_transpose_bias.cc b/test/cpp/fluid/mkldnn/test_onednn_conv2d_transpose_bias.cc index 87311b8e9a2acd..6e5218c157f41e 100644 --- a/test/cpp/fluid/mkldnn/test_onednn_conv2d_transpose_bias.cc +++ b/test/cpp/fluid/mkldnn/test_onednn_conv2d_transpose_bias.cc @@ -61,7 +61,7 @@ void test_conv2d_transpose_bias() { AddVarToScope("convtranspose-Bias", &scope, {256}); AddVarToScope("convtranspose-Out", &scope, {1, 256, 27, 23}); - desc.SetAttr("use_mkldnn", true); + desc.SetAttr("use_onednn", true); desc.SetAttr("is_test", true); auto op = paddle::framework::OpRegistry::CreateOp(desc); diff --git a/test/cpp/fluid/mkldnn/test_onednn_op_inplace.cc b/test/cpp/fluid/mkldnn/test_onednn_op_inplace.cc index 54ff2aa51bb8e4..90e296790107e2 100644 --- a/test/cpp/fluid/mkldnn/test_onednn_op_inplace.cc +++ b/test/cpp/fluid/mkldnn/test_onednn_op_inplace.cc @@ -87,11 +87,11 @@ bool TestMain(const phi::Place &place, ? framework::OpRegistry::CreateOp(op_type, {{"X", {"x"}}, {"Y", {"x1"}}}, {{"Out", {"y"}}}, - {{"use_mkldnn", {true}}}) + {{"use_onednn", {true}}}) : framework::OpRegistry::CreateOp(op_type, {{"X", {"x"}}}, {{"Out", {"y"}}}, - {{"use_mkldnn", {true}}}); + {{"use_onednn", {true}}}); op_ref->Run(scope, place); pool.Get(place)->Wait(); @@ -104,11 +104,11 @@ bool TestMain(const phi::Place &place, ? 
framework::OpRegistry::CreateOp(op_type, {{"X", {"x"}}, {"Y", {"x1"}}}, {{"Out", {"x"}}}, - {{"use_mkldnn", {true}}}) + {{"use_onednn", {true}}}) : framework::OpRegistry::CreateOp(op_type, {{"X", {"x"}}}, {{"Out", {"x"}}}, - {{"use_mkldnn", {true}}}); + {{"use_onednn", {true}}}); op->Run(scope, place); phi::DeviceContextPool::Instance().Get(place)->Wait(); diff --git a/test/cpp/fluid/mkldnn/test_onednn_op_nhwc.cc b/test/cpp/fluid/mkldnn/test_onednn_op_nhwc.cc index f946a0aee1f49c..fc3073f1440759 100644 --- a/test/cpp/fluid/mkldnn/test_onednn_op_nhwc.cc +++ b/test/cpp/fluid/mkldnn/test_onednn_op_nhwc.cc @@ -67,7 +67,7 @@ void Test_Pool2d_Transpose_NHWC(const std::string &transpose_type) { {{"pooling_type", {std::string("max")}}, {"ksize", {ksize}}, {"data_format", {std::string("NHWC")}}, - {"use_mkldnn", {true}}}); + {"use_onednn", {true}}}); auto axis = std::vector(4, 0); axis[1] = 2; @@ -77,7 +77,7 @@ void Test_Pool2d_Transpose_NHWC(const std::string &transpose_type) { transpose_type, {{"X", {"y"}}}, {{"Out", {"z"}}}, - {{"axis", {axis}}, {"use_mkldnn", {true}}}); + {{"axis", {axis}}, {"use_onednn", {true}}}); op_pool->Run(scope, p); op_transpose->Run(scope, p); @@ -130,7 +130,7 @@ TEST(test_pool2d_relu_relu_nhwc, cpu_place) { {{"pooling_type", {std::string("max")}}, {"ksize", {ksize}}, {"data_format", {std::string("NHWC")}}, - {"use_mkldnn", {true}}}); + {"use_onednn", {true}}}); auto axis = std::vector(4, 0); axis[1] = 2; @@ -140,10 +140,10 @@ TEST(test_pool2d_relu_relu_nhwc, cpu_place) { "relu", {{"X", {"y"}}}, {{"Out", {"u"}}}, - {{"axis", {axis}}, {"use_mkldnn", {false}}}); + {{"axis", {axis}}, {"use_onednn", {false}}}); auto op_relu2 = framework::OpRegistry::CreateOp( - "relu", {{"X", {"u"}}}, {{"Out", {"z"}}}, {{"use_mkldnn", {true}}}); + "relu", {{"X", {"u"}}}, {{"Out", {"z"}}}, {{"use_onednn", {true}}}); op_pool->Run(scope, p); op_relu1->Run(scope, p); @@ -192,10 +192,10 @@ TEST(test_pool2d_shape_nhwc, cpu_place) { {{"pooling_type", {std::string("max")}}, {"ksize", {ksize}}, {"data_format", {std::string("NHWC")}}, - {"use_mkldnn", {true}}}); + {"use_onednn", {true}}}); auto op_shape = framework::OpRegistry::CreateOp( - "shape", {{"Input", {"y"}}}, {{"Out", {"z"}}}, {{"use_mkldnn", {true}}}); + "shape", {{"Input", {"y"}}}, {{"Out", {"z"}}}, {{"use_onednn", {true}}}); op_pool->Run(scope, p); op_shape->Run(scope, p); diff --git a/test/cpp/fluid/mkldnn/test_onednn_pool_adaptive_op.cc b/test/cpp/fluid/mkldnn/test_onednn_pool_adaptive_op.cc index 684ad2f1cc3775..1e45aad938ca8d 100644 --- a/test/cpp/fluid/mkldnn/test_onednn_pool_adaptive_op.cc +++ b/test/cpp/fluid/mkldnn/test_onednn_pool_adaptive_op.cc @@ -67,7 +67,7 @@ void test_pool2d(bool adaptive, bool ceil_mode, std::string pool_type = "max") { desc.SetAttr("paddings", paddings); desc.SetAttr("adaptive", adaptive); desc.SetAttr("ceil_mode", ceil_mode); - desc.SetAttr("use_mkldnn", true); + desc.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(desc); diff --git a/test/cpp/fluid/mkldnn/test_onednn_squeeze.cc b/test/cpp/fluid/mkldnn/test_onednn_squeeze.cc index 0a5b253e05bcab..b1dfd5ab5d1b79 100644 --- a/test/cpp/fluid/mkldnn/test_onednn_squeeze.cc +++ b/test/cpp/fluid/mkldnn/test_onednn_squeeze.cc @@ -62,7 +62,7 @@ void test_squeeze() { std::vector axes({-2}); desc.SetAttr("axes", axes); - desc.SetAttr("use_mkldnn", true); + desc.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(desc); @@ -86,7 +86,7 @@ void test_squeeze2() { std::vector axes({-1}); desc.SetAttr("axes", 
axes); - desc.SetAttr("use_mkldnn", true); + desc.SetAttr("use_onednn", true); auto op = paddle::framework::OpRegistry::CreateOp(desc); diff --git a/test/cpp/fluid/op_debug_string_test.cc b/test/cpp/fluid/op_debug_string_test.cc index 5195a53f5826cf..8d797f97e02f47 100644 --- a/test/cpp/fluid/op_debug_string_test.cc +++ b/test/cpp/fluid/op_debug_string_test.cc @@ -37,7 +37,7 @@ TEST(op_debug_str, test_unknown_dtype) { desc.SetOutput(framework::GradVarName("X"), {framework::GradVarName("X")}); desc.SetOutput(framework::GradVarName("Y"), {framework::GradVarName("Y")}); desc.SetAttr("axis", -1); - desc.SetAttr("use_mkldnn", false); + desc.SetAttr("use_onednn", false); auto x_tensor = scope.Var("X")->GetMutable(); x_tensor->Resize(dim); diff --git a/test/cpp/imperative/test_hooks.cc b/test/cpp/imperative/test_hooks.cc index 521e505b98b894..1350bd31539fd9 100644 --- a/test/cpp/imperative/test_hooks.cc +++ b/test/cpp/imperative/test_hooks.cc @@ -104,7 +104,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { NameVarBaseMap ins = {x_pair, y_pair}; NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; // add VariableWrapper hook x->GradVarBase()->AddVariableWrapperHook( @@ -211,7 +211,7 @@ void GradVarLeafBackwardHookWithGradAccumulatedTest() { NameVarBaseMap ins = {x_pair, y_pair}; NameVarBaseMap outs = {out_xy_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); var_pair z_pair = var_pair("Y", vb_vector(1, z)); diff --git a/test/cpp/imperative/test_prepare_op.cc b/test/cpp/imperative/test_prepare_op.cc index 1393683e44100e..ae1030abac0e27 100644 --- a/test/cpp/imperative/test_prepare_op.cc +++ b/test/cpp/imperative/test_prepare_op.cc @@ -247,7 +247,7 @@ TEST(test_prepare_op, test_complex_eager) { #ifdef PADDLE_WITH_DNNL TEST(test_prepare_op, test_prepare_data_cpu_onednn) { - TestPrepareDataSamePlace({{"use_mkldnn", true}}); + TestPrepareDataSamePlace({{"use_onednn", true}}); } #endif } // namespace imperative diff --git a/test/cpp/imperative/test_tracer.cc b/test/cpp/imperative/test_tracer.cc index 305334c6a92bb7..ecca7eb41eb441 100644 --- a/test/cpp/imperative/test_tracer.cc +++ b/test/cpp/imperative/test_tracer.cc @@ -89,7 +89,7 @@ TEST(test_tracer, test_trace_op) { imperative::NameVarBaseMap ins = {x_pair, y_pair}; imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); #ifndef PADDLE_WITH_XPU @@ -141,7 +141,7 @@ TEST(test_tracer, test_trace_op_with_backward) { imperative::NameVarBaseMap ins = {x_pair, y_pair}; imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); const auto& out_tensor = vout->Var().Get(); for (int i = 0; i < vout->Var().Get().numel(); i++) { @@ -187,7 +187,7 @@ TEST(test_tracer, test_track_backward_output) { imperative::NameVarBaseMap ins = {x_pair, y_pair}; imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); ASSERT_EQ(x_in->GradVarBase()->GradOpNum(), 0UL); 
ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); @@ -232,7 +232,7 @@ TEST(test_tracer, test_track_backward_input) { imperative::NameVarBaseMap ins = {x_pair, y_pair}; imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); ASSERT_EQ(x_in->GradVarBase()->GradOpNum(), 0UL); @@ -280,7 +280,7 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { imperative::NameVarBaseMap ins = {x_pair, y_pair}; imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp( "elementwise_add", ins, outs, mul_attr_map, gpu_place, true); @@ -417,7 +417,7 @@ TEST(test_tracer, test_var_without_grad_var) { imperative::NameVarBaseMap ins = {x_pair, y_pair}; imperative::NameVarBaseMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); const auto& out_tensor = vout->Var().Get(); @@ -636,7 +636,7 @@ TEST(test_tracer, eager_tracer) { imperative::NameTensorMap ins = {x_pair, y_pair}; imperative::NameTensorMap outs = {out_pair}; framework::AttributeMap mul_attr_map; - mul_attr_map["use_mkldnn"] = false; + mul_attr_map["use_onednn"] = false; tracer.TraceOp( "mul", ins, outs, mul_attr_map, place, true); diff --git a/test/cpp/pir/cinn/CMakeLists.txt b/test/cpp/pir/cinn/CMakeLists.txt index 0b3591f64f0f2c..2871d040551ee2 100644 --- a/test/cpp/pir/cinn/CMakeLists.txt +++ b/test/cpp/pir/cinn/CMakeLists.txt @@ -47,6 +47,10 @@ if(WITH_TESTING AND WITH_CINN) paddle_test(eliminate_common_factor_of_local_index_test SRCS eliminate_common_factor_of_local_index_test.cc) + paddle_test(ir_simplify_select_test SRCS ir_simplify_select_test.cc) + + paddle_test(ir_simplify_bound_test SRCS ir_simplify_bound_test.cc) + # DO NOT forget add test name here, otherwise it will not be executed in # CINN CI. 
set(cinn_unit_tests diff --git a/test/cpp/pir/cinn/adt/index_expr_test.cc b/test/cpp/pir/cinn/adt/index_expr_test.cc index 3bc2f4ab4e7ae3..a38041f669b20b 100644 --- a/test/cpp/pir/cinn/adt/index_expr_test.cc +++ b/test/cpp/pir/cinn/adt/index_expr_test.cc @@ -52,6 +52,7 @@ class TestIndexExpr : public ::testing::Test { ir::Var S4, S5, S6, S7, S8, S9, f; }; + TEST_F(TestIndexExpr, IndexExpr_0) { ir::IndexExpr a(14); ir::IndexExpr b(7); @@ -643,10 +644,11 @@ TEST_F(TestIndexExpr, MatchPattern) { EXPECT_EQ(result9->at("x"), x); EXPECT_EQ(result9->at("y"), y); } + TEST_F(TestIndexExpr, BoundSimplify) { ir::Var S0 = ir::Var("S0"); - ir::Var i = ir::Var(ir::Expr(0), ir::Expr(5), "i"); - ir::Var j = ir::Var(ir::Expr(0), S0, "j"); + ir::Var i = ir::Var(ir::Expr(0), ir::Expr(5), "i"); // i ∈ [0, 5) + ir::Var j = ir::Var(ir::Expr(0), S0, "j"); // j ∈ [0, S0) ir::Expr q0 = i / Expr(5); ir::Expr q1 = i / Expr(4); diff --git a/test/cpp/pir/cinn/adt/iter_simplify_test.cc b/test/cpp/pir/cinn/adt/iter_simplify_test.cc index 248855b703ff3b..b09bc9d6f521c7 100644 --- a/test/cpp/pir/cinn/adt/iter_simplify_test.cc +++ b/test/cpp/pir/cinn/adt/iter_simplify_test.cc @@ -47,11 +47,12 @@ class TestIterSimplify : public ::testing::Test { i_j_k_fused = ir::Var(ir::Expr(0), ir::Expr(64), "i_j_k_fused").set_index(1); var_intervals = { - {"i", CasInterval(i->lower_bound, i->upper_bound)}, - {"j", CasInterval(j->lower_bound, j->upper_bound)}, - {"k", CasInterval(k->lower_bound, k->upper_bound)}, + {"i", CasInterval(i->lower_bound, i->upper_bound - ir::Expr(1))}, + {"j", CasInterval(j->lower_bound, j->upper_bound - ir::Expr(1))}, + {"k", CasInterval(k->lower_bound, k->upper_bound - ir::Expr(1))}, {"i_j_k_fused", - CasInterval(i_j_k_fused->lower_bound, i_j_k_fused->upper_bound)}}; + CasInterval(i_j_k_fused->lower_bound, + i_j_k_fused->upper_bound - ir::Expr(1))}}; }; ir::Var i; diff --git a/test/cpp/pir/cinn/ir_simplify_bound_test.cc b/test/cpp/pir/cinn/ir_simplify_bound_test.cc new file mode 100644 index 00000000000000..42206af0b9d9b7 --- /dev/null +++ b/test/cpp/pir/cinn/ir_simplify_bound_test.cc @@ -0,0 +1,191 @@ +// Copyright (c) 2025 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
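The bound-driven simplifications asserted in the new `ir_simplify_bound_test.cc` below can be sanity-checked numerically: each identity must hold for every value the loop variables can take within their half-open ranges. A brute-force check in Python (sampling the large `i_j_fused` range for speed):

```python
for i_j_fused in range(0, 524288, 4099):     # sample of [0, 524288)
    for j_0 in range(128):                   # all of [0, 128)
        assert j_0 % 128 == j_0              # SimplifyMod
        assert j_0 // 128 == 0               # SimplifyDiv
        m = i_j_fused % 16
        assert (m * 128 + j_0) // 128 == m   # SimplifyLinearDiv
        assert (m * 128 + j_0) % 128 == j_0  # SimplifyLinearMod

for loop_var_2 in range(32):                 # [0, 32)
    for loop_var_3 in range(4):              # [0, 4)
        # max of loop_var_3 * 32 + loop_var_2 is 3 * 32 + 31 = 127 < 128
        assert (loop_var_3 * 32 + loop_var_2) // 128 == 0  # SimplifyLinearDiv2
```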
+ +#include "paddle/cinn/optim/ir_simplify.h" + +#include + +#include "paddle/cinn/cinn.h" +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir_base.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_nodes_collector.h" +#include "paddle/cinn/ir/utils/stmt_converter.h" +#include "paddle/cinn/utils/string.h" + +namespace cinn { +namespace optim { + +/* +i_j_fused: [0ll, 524288ll) +j_0: [0, 128) +Before Normalize: +(j_0 % 128) +After Normalize: +j_0 +*/ +TEST(IRSimplifyBound, SimplifyMod) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + // Define loop variable + ir::Var var_j_0 = ir::Var(ir::Expr(0), ir::Expr(128), "j_0"); + + // Final expression + ir::Expr expr = ir::Mod::Make(var_j_0, ir::Expr(128)); + + VLOG(6) << "Before Simplify: " << expr; + auto res = expr.as_index().ir::IndexExpr::Normalize( + ir::IndexExpr::OptLevel::kLevel3); + VLOG(6) << "After Simplify: " << res; + + // Expected output verification + std::string expected_ir = R"ROC(j_0)ROC"; + + EXPECT_EQ(utils::GetStreamCnt(res), utils::Trim(expected_ir)); +} + +/* +i_j_fused: [0ll, 524288ll) +j_0: [0, 128) +Before Normalize: +(j_0 / 128) +After Normalize: +0 +*/ +TEST(IRSimplifyBound, SimplifyDiv) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + // Define loop variable + ir::Var var_j_0 = ir::Var(ir::Expr(0), ir::Expr(128), "j_0"); + + // Final expression + ir::Expr expr = ir::Div::Make(var_j_0, ir::Expr(128)); + + VLOG(6) << "Before Normalize: " << expr; + auto res = expr.as_index().ir::IndexExpr::Normalize( + ir::IndexExpr::OptLevel::kLevel3); + VLOG(6) << "After Normalize: " << res; + + // Expected output verification + std::string expected_ir = R"ROC(0)ROC"; + + EXPECT_EQ(utils::GetStreamCnt(res), utils::Trim(expected_ir)); +} + +/* +i_j_fused: [0ll, 524288ll) +j_0: [0, 128) +Before Normalize: +((((i_j_fused % 16) * 128) + j_0) / 128) +After Normalize: +(i_j_fused % 16) +*/ +TEST(IRSimplifyBound, SimplifyLinearDiv) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + // Define loop variables + ir::Var var_i_j_fused = ir::Var(ir::Expr(0), ir::Expr(524288), "i_j_fused"); + ir::Var var_j_0 = ir::Var(ir::Expr(0), ir::Expr(128), "j_0"); + + // Final expression + ir::Expr expr = ir::Div::Make( + ir::Add::Make(ir::Mul::Make(ir::Mod::Make(var_i_j_fused, ir::Expr(16)), + ir::Expr(128)), + var_j_0), + ir::Expr(128)); + + VLOG(6) << "Before Normalize: " << expr; + auto res = expr.as_index().ir::IndexExpr::Normalize( + ir::IndexExpr::OptLevel::kLevel3); + VLOG(6) << "After Normalize: " << res; + + // Expected output verification + std::string expected_ir = R"ROC((i_j_fused % 16))ROC"; + + EXPECT_EQ(utils::GetStreamCnt(res), utils::Trim(expected_ir)); +} + +/* +i_j_fused: [0ll, 524288ll) +j_0: [0, 128) +Before Normalize: +((((i_j_fused % 16) * 128) + j_0) % 128) +After Normalize: +j_0 +*/ +TEST(IRSimplifyBound, SimplifyLinearMod) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + // Define loop variables + ir::Var var_i_j_fused = ir::Var(ir::Expr(0), ir::Expr(524288), "i_j_fused"); + ir::Var var_j_0 = ir::Var(ir::Expr(0), ir::Expr(128), "j_0"); + + // Final expression + ir::Expr expr = ir::Mod::Make( + ir::Add::Make(ir::Mul::Make(ir::Mod::Make(var_i_j_fused, ir::Expr(16)), + ir::Expr(128)), + var_j_0), + ir::Expr(128)); + + VLOG(6) << "Before 
Normalize: " << expr; + auto res = expr.as_index().ir::IndexExpr::Normalize( + ir::IndexExpr::OptLevel::kLevel3); + VLOG(6) << "After Normalize: " << res; + + // Expected output verification + std::string expected_ir = R"ROC(j_0)ROC"; + + EXPECT_EQ(utils::GetStreamCnt(res), utils::Trim(expected_ir)); +} + +/* +loop_var_2: [0, 32) +loop_var_3: [0, 4) +Before Normalize: +(((loop_var_3 * 32ll) + loop_var_2) / 128ll) +After Normalize: +0 +*/ +TEST(IRSimplifyBound, SimplifyLinearDiv2) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + // Define loop variables + ir::Var loop_var_2 = ir::Var(ir::Expr(0), ir::Expr(32), "loop_var_2"); + ir::Var loop_var_3 = ir::Var(ir::Expr(0), ir::Expr(4), "loop_var_3"); + + // Final expression + ir::Expr expr = ir::Div::Make( + ir::Add::Make(ir::Mul::Make(loop_var_3, ir::Expr(32)), loop_var_2), + ir::Expr(128)); + + VLOG(6) << "Before Normalize: " << expr; + auto res = expr.as_index().ir::IndexExpr::Normalize( + ir::IndexExpr::OptLevel::kLevel3); + VLOG(6) << "After Normalize: " << res; + + // Expected output verification + std::string expected_ir = R"ROC(0)ROC"; + + EXPECT_EQ(utils::GetStreamCnt(res), utils::Trim(expected_ir)); +} + +} // namespace optim +} // namespace cinn diff --git a/test/cpp/pir/cinn/ir_simplify_select_test.cc b/test/cpp/pir/cinn/ir_simplify_select_test.cc new file mode 100644 index 00000000000000..0f236e9d266865 --- /dev/null +++ b/test/cpp/pir/cinn/ir_simplify_select_test.cc @@ -0,0 +1,336 @@ +// Copyright (c) 2025 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/cinn/optim/ir_simplify.h" + +#include + +#include "paddle/cinn/cinn.h" +#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_nodes_collector.h" +#include "paddle/cinn/ir/utils/stmt_converter.h" +#include "paddle/cinn/utils/string.h" + +namespace cinn { +namespace optim { + +/* +serial for (i, 0ll, 32768ll) { + serial for (j, 0ll, 16ll) { + serial for (reduce_k_0, 0ll, 128ll) { + var_18[i, j] = select((var_18[i, j] > var_17[i, j, reduce_k_0]), +var_18[i, j], var_17[i, j, reduce_k_0]) + } + } + } +} +*/ +TEST(IRSimplifySelect, SimplifySelectToMax) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + const std::vector shape_2d = {ir::Expr(32768), ir::Expr(16)}; + const std::vector shape_3d = { + ir::Expr(32768), ir::Expr(16), ir::Expr(128)}; + + ir::Tensor var_17 = + ir::_Tensor_::Make("var_17", ir::Float(32), shape_3d, shape_3d); + var_17->WithBuffer("global", "var_17_buffer"); + + ir::Tensor var_18 = + ir::_Tensor_::Make("var_18", ir::Float(32), shape_2d, shape_2d); + var_18->WithBuffer("global", "var_18_buffer"); + + // Define loop variables + ir::Var var_i = ir::Var(ir::Expr(0), ir::Expr(32768), "i"); + ir::Var var_j = ir::Var(ir::Expr(0), ir::Expr(16), "j"); + ir::Var var_reduce_k_0 = ir::Var(ir::Expr(0), ir::Expr(128), "reduce_k_0"); + + // Create innermost reduction loop body + ir::Expr reduce_body = ir::Store::Make( + var_18, + ir::Select::Make( + ir::GT::Make(ir::Load::Make(var_18, {var_i, var_j}), + ir::Load::Make(var_17, {var_i, var_j, var_reduce_k_0})), + ir::Load::Make(var_18, {var_i, var_j}), + ir::Load::Make(var_17, {var_i, var_j, var_reduce_k_0})), + {var_i, var_j}); + + // Create reduction loop + ir::Expr reduce_loop = ir::For::Make(var_reduce_k_0, + ir::Expr(0), + ir::Expr(128), + ir::ForType::Serial, + ir::DeviceAPI::Host, + ir::Block::Make({reduce_body})); + + // Create j loop + ir::Expr j_loop = ir::For::Make(var_j, + ir::Expr(0), + ir::Expr(16), + ir::ForType::Serial, + ir::DeviceAPI::Host, + ir::Block::Make({reduce_loop})); + + // Create i loop + ir::Expr i_loop = ir::For::Make(var_i, + ir::Expr(0), + ir::Expr(32768), + ir::ForType::Serial, + ir::DeviceAPI::Host, + ir::Block::Make({j_loop})); + + // Final expression + ir::Expr expr = ir::Block::Make({i_loop}); + + VLOG(6) << "Before Simplify: " << expr; + Simplify(&expr); + VLOG(6) << "After Simplify: " << expr; + + // Expected output verification + std::string expected_ir = R"ROC({ + serial for (i, 0, 32768) + { + serial for (j, 0, 16) + { + serial for (reduce_k_0, 0, 128) + { + var_18[i, j] = cinn_max(var_17[i, j, reduce_k_0], var_18[i, j]) + } + } + } +})ROC"; + + EXPECT_EQ(utils::GetStreamCnt(expr), utils::Trim(expected_ir)); +} + +/* +serial for (i, 0ll, 32768ll) { + serial for (j, 0ll, 16ll) { + serial for (reduce_k_0, 0ll, 128ll) { + var_18[i, j] = select((var_18[i, j] < var_17[i, j, reduce_k_0]), +var_18[i, j], var_17[i, j, reduce_k_0]) + } + } + } +} +*/ +TEST(IRSimplifySelect, SimplifySelectToMin) { + Context::Global().ResetNameId(); + + // Create input IR matching the specified pattern + const std::vector shape_2d = {ir::Expr(32768), ir::Expr(16)}; + const std::vector shape_3d = { + ir::Expr(32768), ir::Expr(16), ir::Expr(128)}; + + ir::Tensor var_17 = + ir::_Tensor_::Make("var_17", ir::Float(32), shape_3d, shape_3d); + var_17->WithBuffer("global", "var_17_buffer"); + + ir::Tensor var_18 = + 
+
+/*
+serial for (i, 0ll, 32768ll) {
+  serial for (j, 0ll, 16ll) {
+    serial for (reduce_k_0, 0ll, 128ll) {
+      var_18[i, j] = select((var_18[i, j] < var_17[i, j, reduce_k_0]),
+var_18[i, j], var_17[i, j, reduce_k_0])
+    }
+  }
+}
+*/
+TEST(IRSimplifySelect, SimplifySelectToMin) {
+  Context::Global().ResetNameId();
+
+  // Create input IR matching the specified pattern
+  const std::vector<ir::Expr> shape_2d = {ir::Expr(32768), ir::Expr(16)};
+  const std::vector<ir::Expr> shape_3d = {
+      ir::Expr(32768), ir::Expr(16), ir::Expr(128)};
+
+  ir::Tensor var_17 =
+      ir::_Tensor_::Make("var_17", ir::Float(32), shape_3d, shape_3d);
+  var_17->WithBuffer("global", "var_17_buffer");
+
+  ir::Tensor var_18 =
+      ir::_Tensor_::Make("var_18", ir::Float(32), shape_2d, shape_2d);
+  var_18->WithBuffer("global", "var_18_buffer");
+
+  // Define loop variables
+  ir::Var var_i = ir::Var(ir::Expr(0), ir::Expr(32768), "i");
+  ir::Var var_j = ir::Var(ir::Expr(0), ir::Expr(16), "j");
+  ir::Var var_reduce_k_0 = ir::Var(ir::Expr(0), ir::Expr(128), "reduce_k_0");
+
+  // Create innermost reduction loop body
+  ir::Expr reduce_body = ir::Store::Make(
+      var_18,
+      ir::Select::Make(
+          ir::LT::Make(ir::Load::Make(var_18, {var_i, var_j}),
+                       ir::Load::Make(var_17, {var_i, var_j, var_reduce_k_0})),
+          ir::Load::Make(var_18, {var_i, var_j}),
+          ir::Load::Make(var_17, {var_i, var_j, var_reduce_k_0})),
+      {var_i, var_j});
+
+  // Create reduction loop
+  ir::Expr reduce_loop = ir::For::Make(var_reduce_k_0,
+                                       ir::Expr(0),
+                                       ir::Expr(128),
+                                       ir::ForType::Serial,
+                                       ir::DeviceAPI::Host,
+                                       ir::Block::Make({reduce_body}));
+
+  // Create j loop
+  ir::Expr j_loop = ir::For::Make(var_j,
+                                  ir::Expr(0),
+                                  ir::Expr(16),
+                                  ir::ForType::Serial,
+                                  ir::DeviceAPI::Host,
+                                  ir::Block::Make({reduce_loop}));
+
+  // Create i loop
+  ir::Expr i_loop = ir::For::Make(var_i,
+                                  ir::Expr(0),
+                                  ir::Expr(32768),
+                                  ir::ForType::Serial,
+                                  ir::DeviceAPI::Host,
+                                  ir::Block::Make({j_loop}));
+
+  // Final expression
+  ir::Expr expr = ir::Block::Make({i_loop});
+
+  VLOG(6) << "Before Simplify: " << expr;
+  Simplify(&expr);
+  VLOG(6) << "After Simplify: " << expr;
+
+  // Expected output verification
+  std::string expected_ir = R"ROC({
+  serial for (i, 0, 32768)
+  {
+    serial for (j, 0, 16)
+    {
+      serial for (reduce_k_0, 0, 128)
+      {
+        var_18[i, j] = cinn_min(var_18[i, j], var_17[i, j, reduce_k_0])
+      }
+    }
+  }
+})ROC";
+
+  EXPECT_EQ(utils::GetStreamCnt(expr), utils::Trim(expected_ir));
+}
+
+/*
+serial for (i, 0ll, 32768ll)
+{
+  serial for (j, 0, 16)
+  {
+    serial for (j_0, 0, 128)
+    {
+      var_45[i, j, j_0] = select(
+          (var_18[i, ((((j * 128ll) + j_0) / 128ll) + 0ll)] <=
+           float32(3.4028234663852886e+38)),
+          select(
+              (var_18[i, ((((j * 128ll) + j_0) / 128ll) + 0ll)] >=
+               float32(9.9999997473787516e-05)),
+              var_18[i, ((((j * 128ll) + j_0) / 128ll) + 0ll)],
+              float32(9.9999997473787516e-05)),
+          float32(3.4028234663852886e+38))
+    }
+  }
+}
+*/
+TEST(IRSimplifySelect, SimplifySelectToMinMax) {
+  Context::Global().ResetNameId();
+
+  // Create input IR matching the specified pattern
+  const std::vector<ir::Expr> shape_2d = {ir::Expr(32768), ir::Expr(16)};
+  const std::vector<ir::Expr> shape_3d = {
+      ir::Expr(32768), ir::Expr(16), ir::Expr(128)};
+
+  ir::Tensor var_18 =
+      ir::_Tensor_::Make("var_18", ir::Float(32), shape_2d, shape_2d);
+  var_18->WithBuffer("global", "var_18_buffer");
+
+  ir::Tensor var_45 =
+      ir::_Tensor_::Make("var_45", ir::Float(32), shape_3d, shape_3d);
+  var_45->WithBuffer("global", "var_45_buffer");
+
+  // Define loop variables
+  ir::Var var_i = ir::Var(ir::Expr(0), ir::Expr(32768), "i");
+  ir::Var var_j = ir::Var(ir::Expr(0), ir::Expr(16), "j");
+  ir::Var var_j_0 = ir::Var(ir::Expr(0), ir::Expr(128), "j_0");
+
+  // Create innermost loop body
+  ir::Expr body = ir::Store::Make(
+      var_45,
+      ir::Select::Make(
+          ir::LE::Make(
+              ir::Load::Make(
+                  var_18,
+                  {var_i,
+                   ir::Div::Make(
+                       ir::Add::Make(ir::Mul::Make(var_j, ir::Expr(128)),
+                                     var_j_0),
+                       ir::Expr(128))}),
+              ir::Expr(3.4028234663852886e+38f)),
+          ir::Select::Make(
+              ir::GE::Make(
+                  ir::Load::Make(
+                      var_18,
+                      {var_i,
+                       ir::Div::Make(
+                           ir::Add::Make(ir::Mul::Make(var_j, ir::Expr(128)),
+                                         var_j_0),
+                           ir::Expr(128))}),
+                  ir::Expr(9.9999997473787516e-05f)),
+              ir::Load::Make(
+                  var_18,
+                  {var_i,
+                   ir::Div::Make(
+                       ir::Add::Make(ir::Mul::Make(var_j, ir::Expr(128)),
+                                     var_j_0),
+                       ir::Expr(128))}),
+              ir::Expr(9.9999997473787516e-05f)),
+          ir::Expr(3.4028234663852886e+38f)),
+      {var_i, var_j, var_j_0});
+
+  // Create j_0 loop
+  ir::Expr j_0_loop = ir::For::Make(var_j_0,
+                                    ir::Expr(0),
+                                    ir::Expr(128),
+                                    ir::ForType::Serial,
+                                    ir::DeviceAPI::Host,
+                                    ir::Block::Make({body}));
+
+  // Create j loop
+  ir::Expr j_loop = ir::For::Make(var_j,
+                                  ir::Expr(0),
+                                  ir::Expr(16),
+                                  ir::ForType::Serial,
+                                  ir::DeviceAPI::Host,
+                                  ir::Block::Make({j_0_loop}));
+
+  // Create i loop
+  ir::Expr i_loop = ir::For::Make(var_i,
+                                  ir::Expr(0),
+                                  ir::Expr(32768),
+                                  ir::ForType::Serial,
+                                  ir::DeviceAPI::Host,
+                                  ir::Block::Make({j_loop}));
+
+  // Final expression
+  ir::Expr expr = ir::Block::Make({i_loop});
+
+  VLOG(6) << "Before Simplify: " << expr;
+  Simplify(&expr);
+  VLOG(6) << "After Simplify: " << expr;
+
+  // Expected output verification
+  std::string expected_ir = R"ROC({
+  serial for (i, 0, 32768)
+  {
+    serial for (j, 0, 16)
+    {
+      serial for (j_0, 0, 128)
+      {
+        var_45[i, j, j_0] = cinn_min(cinn_max(var_18[i, (((j * 128) + j_0) / 128)], 9.99999975e-05f), 3.40282347e+38f)
+      }
+    }
+  }
+})ROC";
+
+  EXPECT_EQ(utils::GetStreamCnt(expr), utils::Trim(expected_ir));
+}
+}  // namespace optim
+}  // namespace cinn
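Nesting the two rewrites yields a clamp: select(x <= hi, select(x >= lo, x, lo), hi) becomes cinn_min(cinn_max(x, lo), hi), assuming lo <= hi. A minimal numeric check of that equivalence, again outside CINN:

    #include <algorithm>
    #include <cassert>

    int main() {
      const float lo = 9.9999997473787516e-05f;  // the epsilon in the test
      const float hi = 3.4028234663852886e+38f;  // float max, as in the test
      for (float x : {-1.0f, 0.0f, 5e-05f, 1.0f, 1e+10f}) {
        float nested = (x <= hi) ? ((x >= lo) ? x : lo) : hi;  // input IR shape
        assert(nested == std::min(std::max(x, lo), hi));       // output IR shape
      }
      return 0;
    }

Note that the index (((j * 128) + j_0) / 128) in the expected output is left alone by this pass; folding it is the job of the bound-aware normalization tested earlier.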
diff --git a/test/cpp/pir/cinn/ir_simplify_test.cc b/test/cpp/pir/cinn/ir_simplify_test.cc
index e682079e72a90a..485216814f0102 100644
--- a/test/cpp/pir/cinn/ir_simplify_test.cc
+++ b/test/cpp/pir/cinn/ir_simplify_test.cc
@@ -479,5 +479,98 @@ TEST(IRSimplify, if_fold_EQ_2) {
   }
 )ROC"));
 }
+
+/*
+serial for (i_j_fused, 0ll, 524288ll)
+{
+  serial for (j_0, 0, 128)
+  {
+    var_45[(i_j_fused / 16), (((i_j_fused % 16) * 128) + j_0)] =
+        pow(2.0f, ceil(log2((0.00223214296f * var_31[0]))))
+  }
+}
+*/
+TEST(IRSimplifyPowerCeilLog2BitOpLdexpf, Base) {
+  Context::Global().ResetNameId();
+
+  // Create input IR matching the specified pattern
+  const std::vector<ir::Expr> shape_2d = {ir::Expr(32768), ir::Expr(16)};
+  const std::vector<ir::Expr> shape_3d = {ir::Expr(32768), ir::Expr(16)};
+
+  ir::Tensor var_31 =
+      ir::_Tensor_::Make("var_31", ir::Float(32), shape_2d, shape_2d);
+  var_31->WithBuffer("global", "var_31_buffer");
+
+  ir::Tensor var_45 =
+      ir::_Tensor_::Make("var_45", ir::Float(32), shape_3d, shape_3d);
+  var_45->WithBuffer("global", "var_45_buffer");
+
+  // Define loop variables
+  ir::Var var_i_j_fused = ir::Var(ir::Expr(0), ir::Expr(524288), "i_j_fused");
+  ir::Var var_j_0 = ir::Var(ir::Expr(0), ir::Expr(128), "j_0");
+
+  // Create innermost loop body
+  ir::Expr body = ir::Store::Make(
+      var_45,
+      ir::Call::Make(
+          ir::Float(32),  // Return type
+          "pow",          // Intrinsic function name
+          {ir::Expr(2.0f),
+           ir::Call::Make(
+               ir::Float(32),
+               "ceil",
+               {ir::Call::Make(
+                   ir::Float(32),
+                   "log2",
+                   {ir::Mul::Make(ir::Expr(0.00223214296f),
+                                  ir::Load::Make(var_31, {ir::Expr(0)}))},
+                   {},
+                   ir::CallType::Intrinsic)},
+               {},
+               ir::CallType::Intrinsic)},
+          {},
+          ir::CallType::Intrinsic),
+      {ir::Div::Make(var_i_j_fused, ir::Expr(16)),
+       ir::Add::Make(ir::Mul::Make(ir::Mod::Make(var_i_j_fused, ir::Expr(16)),
+                                   ir::Expr(128)),
+                     var_j_0)});
+
+  // Create j_0 loop
+  ir::Expr j_0_loop = ir::For::Make(var_j_0,
+                                    ir::Expr(0),
+                                    ir::Expr(128),
+                                    ir::ForType::Serial,
+                                    ir::DeviceAPI::Host,
+                                    ir::Block::Make({body}));
+
+  // Create i_j_fused loop
+  ir::Expr i_j_fused_loop = ir::For::Make(var_i_j_fused,
+                                          ir::Expr(0),
+                                          ir::Expr(524288),
+                                          ir::ForType::Serial,
+                                          ir::DeviceAPI::Host,
+                                          ir::Block::Make({j_0_loop}));
+
+  // Final expression
+  ir::Expr expr = ir::Block::Make({i_j_fused_loop});
+
+  VLOG(6) << "Before Simplify: " << expr;
+  cinn::optim::Simplify(&expr);
+  VLOG(6) << "After Simplify: " << expr;
+
+  // Expected output verification
+  std::string expected_ir = R"ROC({
+  serial for (i_j_fused, 0, 524288)
+  {
+    serial for (j_0, 0, 128)
+    {
+      var_45[(i_j_fused / 16), (((i_j_fused % 16) * 128) + j_0)] = ldexpf(1.00000000f, ((bitwise_and(right_shift(__float_as_uint((0.00223214296f * var_31[0])), 23), 255) - 127) + select((((bitwise_and(right_shift(__float_as_uint((0.00223214296f * var_31[0])), 23), 255) - 127) != -127) and (bitwise_and(__float_as_uint((0.00223214296f * var_31[0])), 8388607) != 0)), 1, 0)))
+    }
+  }
+})ROC";
+
+  EXPECT_EQ(utils::GetStreamCnt(expr), utils::Trim(expected_ir));
+}
+
 }  // namespace common
 }  // namespace cinn
diff --git a/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc b/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc
index 38978395b5ac7c..903cb9357cceea 100644
--- a/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc
+++ b/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc
@@ -317,7 +317,7 @@ TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
   config.SetModel(FLAGS_dirname);
   config.SwitchIrOptim(true);
   config.EnableUseGpu(100, 0);
-  config.EnableMkldnnBfloat16();
+  config.EnableOnednnBfloat16();
 #ifdef PADDLE_WITH_DNNL
   if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core))
     ASSERT_EQ(config.onednn_bfloat16_enabled(), true);
@@ -332,7 +332,7 @@ TEST(AnalysisPredictor, bf16_pass_strategy) {
   std::vector<std::string> passes;
   PassStrategy passStrategy(passes);
-  passStrategy.EnableMkldnnBfloat16();
+  passStrategy.EnableOnednnBfloat16();
 }
 
 TEST(AnalysisPredictor, onednn_fc_pass_strategy) {
diff --git a/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc b/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc
index ec10b780a35eeb..e30b8f364c7199 100644
--- a/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc
+++ b/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc
@@ -69,7 +69,7 @@ TEST(Analyzer_bert, compare) { CompareNativeAndAnalysisWrapper(); }
 #ifdef PADDLE_WITH_DNNL
-TEST(Analyzer_bert, compare_mkldnn) {
+TEST(Analyzer_bert, compare_onednn) {
   auto use_onednn = true;
   CompareNativeAndAnalysisWrapper(use_onednn);
 }
@@ -210,7 +210,7 @@ AnalysisConfig SetConfig(bool use_onednn, bool use_bfloat16) {
     config.EnableONEDNN();
   }
 
-  if (use_bfloat16) config.EnableMkldnnBfloat16();
+  if (use_bfloat16) config.EnableOnednnBfloat16();
 
   return config;
 }
diff --git a/test/deprecated/cpp/inference/api/analyzer_image_classification_tester.cc b/test/deprecated/cpp/inference/api/analyzer_image_classification_tester.cc
index 47c53d249e00c6..9915fac72873f3 100644
--- a/test/deprecated/cpp/inference/api/analyzer_image_classification_tester.cc
+++ b/test/deprecated/cpp/inference/api/analyzer_image_classification_tester.cc
@@ -37,11 +37,11 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 }
 
 // Easy for profiling independently.
-void profile(bool use_mkldnn = false) { +void profile(bool use_onednn = false) { AnalysisConfig cfg; SetConfig(&cfg); - if (use_mkldnn) { + if (use_onednn) { cfg.EnableONEDNN(); if (FLAGS_disable_onednn_fc) { cfg.DisableOnednnFcPasses(); @@ -59,14 +59,14 @@ void profile(bool use_mkldnn = false) { TEST(Analyzer_resnet50, profile) { profile(); } #ifdef PADDLE_WITH_DNNL -TEST(Analyzer_resnet50, profile_mkldnn) { profile(true /* use_mkldnn */); } +TEST(Analyzer_resnet50, profile_onednn) { profile(true /* use_onednn */); } #endif // Compare result of NativeConfig and AnalysisConfig -void compare(bool use_mkldnn = false) { +void compare(bool use_onednn = false) { AnalysisConfig cfg; SetConfig(&cfg); - if (use_mkldnn) { + if (use_onednn) { cfg.EnableONEDNN(); if (FLAGS_disable_onednn_fc) { cfg.DisableOnednnFcPasses(); @@ -81,7 +81,7 @@ void compare(bool use_mkldnn = false) { TEST(Analyzer_resnet50, compare) { compare(); } #ifdef PADDLE_WITH_DNNL -TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); } +TEST(Analyzer_resnet50, compare_onednn) { compare(true /* use_onednn */); } #endif // Compare Deterministic result diff --git a/test/deprecated/cpp/inference/api/analyzer_transformer_compare_tester.cc b/test/deprecated/cpp/inference/api/analyzer_transformer_compare_tester.cc index 04885a97ec19ba..a4dec2b4755eb5 100644 --- a/test/deprecated/cpp/inference/api/analyzer_transformer_compare_tester.cc +++ b/test/deprecated/cpp/inference/api/analyzer_transformer_compare_tester.cc @@ -19,10 +19,10 @@ namespace inference { namespace analysis { namespace transformer_tester { -void compare(bool use_mkldnn = false) { +void compare(bool use_onednn = false) { AnalysisConfig cfg; SetConfig(&cfg); - if (!use_mkldnn) { + if (!use_onednn) { cfg.DisableONEDNN(); } @@ -34,7 +34,7 @@ void compare(bool use_mkldnn = false) { TEST(Analyzer_Transformer, compare) { compare(); } #ifdef PADDLE_WITH_DNNL -TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); } +TEST(Analyzer_Transformer, compare_onednn) { compare(true /* use_onednn */); } #endif } // namespace transformer_tester diff --git a/test/deprecated/cpp/inference/api/analyzer_transformer_profile_tester.cc b/test/deprecated/cpp/inference/api/analyzer_transformer_profile_tester.cc index 83f2f0041f8cce..6b6579beacc836 100644 --- a/test/deprecated/cpp/inference/api/analyzer_transformer_profile_tester.cc +++ b/test/deprecated/cpp/inference/api/analyzer_transformer_profile_tester.cc @@ -19,11 +19,11 @@ namespace inference { namespace analysis { namespace transformer_tester { -void profile(bool use_mkldnn = false) { +void profile(bool use_onednn = false) { AnalysisConfig cfg; SetConfig(&cfg); std::vector> outputs; - if (use_mkldnn) { + if (use_onednn) { cfg.EnableONEDNN(); } @@ -37,7 +37,7 @@ void profile(bool use_mkldnn = false) { TEST(Analyzer_Transformer, profile) { profile(); } #ifdef PADDLE_WITH_DNNL -TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); } +TEST(Analyzer_Transformer, profile_onednn) { profile(true); } #endif } // namespace transformer_tester diff --git a/test/deprecated/ir/inference/CMakeLists.txt b/test/deprecated/ir/inference/CMakeLists.txt index 86f03ba89d9850..7fcff5451e2d2c 100755 --- a/test/deprecated/ir/inference/CMakeLists.txt +++ b/test/deprecated/ir/inference/CMakeLists.txt @@ -56,7 +56,7 @@ if(WIN32) endif() -if(NOT WITH_MKLDNN +if(NOT WITH_ONEDNN AND NOT TENSORRT_FOUND AND NOT WITH_GPU) foreach(target ${TEST_INFERENCE_CPU_UT}) diff --git a/test/deprecated/ir/inference/auto_scan_test.py 
b/test/deprecated/ir/inference/auto_scan_test.py index 752b5f32d011ba..16a8dbf24c8f30 100755 --- a/test/deprecated/ir/inference/auto_scan_test.py +++ b/test/deprecated/ir/inference/auto_scan_test.py @@ -226,7 +226,7 @@ def create_inference_config( self, passes: list[str] | None = None, use_gpu: bool = False, - use_mkldnn: bool = False, + use_onednn: bool = False, use_xpu: bool = False, ir_optim: bool | None = None, ): @@ -238,7 +238,7 @@ def create_inference_config( config.switch_ir_optim(ir_optim) if use_gpu: config.enable_use_gpu(100, 0) - if not use_mkldnn: + if not use_onednn: config.disable_onednn() if use_xpu: config.enable_xpu() @@ -337,7 +337,7 @@ def run_test(self, quant=False, *args, **kwargs): def inference_config_str(self, config) -> str: dic = {} enable_onednn = config.onednn_enabled() - dic["use_mkldnn"] = enable_onednn + dic["use_onednn"] = enable_onednn enable_gpu = config.use_gpu() dic["use_gpu"] = enable_gpu return str(dic) @@ -573,7 +573,7 @@ def run_test(self, quant=False, prog_configs=None): def inference_config_str(self, config) -> str: dic = {} enable_onednn = config.onednn_enabled() - dic["use_mkldnn"] = enable_onednn + dic["use_onednn"] = enable_onednn enable_gpu = config.use_gpu() dic['use_gpu'] = enable_gpu enable_xpu = config.use_xpu() diff --git a/test/deprecated/ir/inference/inference_pass_test.py b/test/deprecated/ir/inference/inference_pass_test.py index 739716382f50bd..acf9b68aefa458 100644 --- a/test/deprecated/ir/inference/inference_pass_test.py +++ b/test/deprecated/ir/inference/inference_pass_test.py @@ -129,7 +129,7 @@ def _get_inference_outs(self, config): return outs def _get_analysis_config( - self, use_gpu=False, use_trt=False, use_mkldnn=False + self, use_gpu=False, use_trt=False, use_onednn=False ): ''' Return a new object of AnalysisConfig. @@ -177,7 +177,7 @@ def _get_analysis_config( if self.enable_tensorrt_varseqlen: config.enable_tensorrt_varseqlen() - elif use_mkldnn: + elif use_onednn: config.enable_onednn() if self.enable_onednn_bfloat16: config.enable_onednn_bfloat16() @@ -186,7 +186,7 @@ def _get_analysis_config( def check_output(self, atol=1e-3): ''' Check whether calculating on CPU and GPU, enable TensorRT - or disable TensorRT, enable MKLDNN or disable MKLDNN + or disable TensorRT, enable ONEDNN or disable ONEDNN are all the same. ''' self.assertFalse( @@ -201,7 +201,7 @@ def check_output_with_option( ): ''' Check whether calculating on CPU and GPU, enable TensorRT - or disable TensorRT, enable MKLDNN or disable MKLDNN + or disable TensorRT, enable ONEDNN or disable ONEDNN are all the same. ''' place = base.CUDAPlace(0) if use_gpu else base.CPUPlace() @@ -287,13 +287,13 @@ def check_output_with_option( if (not use_gpu) and self.enable_mkldnn: onednn_outputs = self._get_inference_outs( self._get_analysis_config( - use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn + use_gpu=use_gpu, use_onednn=self.enable_mkldnn ) ) self.assertTrue( len(paddle_outs) == len(onednn_outputs), - "The number of outputs is different between CPU and MKLDNN. ", + "The number of outputs is different between CPU and ONEDNN. ", ) if self.enable_onednn_bfloat16: @@ -304,7 +304,7 @@ def check_output_with_option( onednn_output, rtol=1e-05, atol=atol, - err_msg='Output has diff between CPU and MKLDNN. ', + err_msg='Output has diff between CPU and ONEDNN. 
', ) class TensorRTParam: diff --git a/test/deprecated/ir/inference/quant_dequant_test.py b/test/deprecated/ir/inference/quant_dequant_test.py index 69f2ddfaaa4fda..cb3ddc06b76f13 100644 --- a/test/deprecated/ir/inference/quant_dequant_test.py +++ b/test/deprecated/ir/inference/quant_dequant_test.py @@ -190,7 +190,7 @@ def _get_inference_outs(self, config): return outs def _get_analysis_config( - self, use_gpu=False, use_trt=False, use_mkldnn=False + self, use_gpu=False, use_trt=False, use_onednn=False ): ''' Return a new object of AnalysisConfig. @@ -230,7 +230,7 @@ def _get_analysis_config( if self.enable_tensorrt_varseqlen: config.enable_tensorrt_varseqlen() - elif use_mkldnn: + elif use_onednn: config.enable_onednn() if self.enable_onednn_bfloat16: config.enable_onednn_bfloat16() @@ -241,7 +241,7 @@ def check_output_with_option( ): ''' Check whether calculating on CPU and GPU, enable TensorRT - or disable TensorRT, enable MKLDNN or disable MKLDNN + or disable TensorRT, enable ONEDNN or disable ONEDNN are all the same. ''' place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace() @@ -390,13 +390,13 @@ def check_output_with_option( if (not use_gpu) and self.enable_mkldnn: onednn_outputs = self._get_inference_outs( self._get_analysis_config( - use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn + use_gpu=use_gpu, use_onednn=self.enable_mkldnn ) ) self.assertTrue( len(paddle_outs) == len(onednn_outputs), - "The number of outputs is different between CPU and MKLDNN. ", + "The number of outputs is different between CPU and ONEDNN. ", ) if self.enable_onednn_bfloat16: @@ -407,7 +407,7 @@ def check_output_with_option( onednn_output, rtol=1e-05, atol=atol, - err_msg='Output has diff between CPU and MKLDNN. ', + err_msg='Output has diff between CPU and ONEDNN. 
', ) class TensorRTParam: diff --git a/test/deprecated/legacy_test/test_batch_norm_op_deprecated.py b/test/deprecated/legacy_test/test_batch_norm_op_deprecated.py index 9c4abf21fab0d2..bed1666fffa63b 100644 --- a/test/deprecated/legacy_test/test_batch_norm_op_deprecated.py +++ b/test/deprecated/legacy_test/test_batch_norm_op_deprecated.py @@ -303,7 +303,7 @@ def test_with_place(place, data_layout, shape): "epsilon": epsilon, "is_test": False, "data_layout": data_layout, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "fuse_with_relu": self.fuse_with_relu, "use_global_stats": self.use_global_stats, } diff --git a/test/deprecated/legacy_test/test_layer_norm_op_deprecated.py b/test/deprecated/legacy_test/test_layer_norm_op_deprecated.py index d487569028ddea..c097e5b3ce8c70 100644 --- a/test/deprecated/legacy_test/test_layer_norm_op_deprecated.py +++ b/test/deprecated/legacy_test/test_layer_norm_op_deprecated.py @@ -142,10 +142,10 @@ def check_forward_backward( has_scale=True, has_bias=True, y_grad_scale=1.0, - use_mkldnn=False, + use_onednn=False, ): def test_with_place( - place, shape, begin_norm_axis, use_mkldnn=use_mkldnn + place, shape, begin_norm_axis, use_onednn=use_onednn ): # attr epsilon = 0.00001 @@ -221,7 +221,7 @@ def test_with_place( attrs={ "epsilon": epsilon, "begin_norm_axis": begin_norm_axis, - "use_mkldnn": use_mkldnn, + "use_onednn": use_onednn, }, ) # generate backward op_desc diff --git a/test/deprecated/legacy_test/test_program_deprecated.py b/test/deprecated/legacy_test/test_program_deprecated.py index 5efba85dc5c0b0..582feeda7aabb2 100644 --- a/test/deprecated/legacy_test/test_program_deprecated.py +++ b/test/deprecated/legacy_test/test_program_deprecated.py @@ -153,7 +153,7 @@ class TestProgramProto(unittest.TestCase): def test_update_op(self): program = build_program() a = program.desc.serialize_to_string() - program.current_block().ops[0]._set_attr('use_mkldnn', True) + program.current_block().ops[0]._set_attr('use_onednn', True) self.assertTrue(program.desc.need_update()) b = program.desc.serialize_to_string() self.assertFalse(a == b) @@ -230,7 +230,7 @@ def test_program_update(self): hash1 = program.desc.cached_hash_str() id1 = id(program) # change mul's attr - program.current_block().ops[0]._set_attr('use_mkldnn', True) + program.current_block().ops[0]._set_attr('use_onednn', True) program.current_block().ops[0]._set_attr('scale_x', 2.0) hash2 = program.desc.cached_hash_str() id2 = id(program) diff --git a/test/deprecated/mkldnn/CMakeLists.txt b/test/deprecated/mkldnn/CMakeLists.txt index 12dfb5eb93d25b..997e554e2cd9de 100644 --- a/test/deprecated/mkldnn/CMakeLists.txt +++ b/test/deprecated/mkldnn/CMakeLists.txt @@ -1,12 +1,12 @@ file( - GLOB TEST_MKLDNN_LISTS + GLOB TEST_ONEDNN_LISTS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_MKLDNN_LISTS "${TEST_MKLDNN_LISTS}") +string(REPLACE ".py" "" TEST_ONEDNN_LISTS "${TEST_ONEDNN_LISTS}") if(WIN32) message(STATUS "Skip tests unrelated to onednn/mkldnn") elseif(WITH_ONEDNN) - foreach(target ${TEST_MKLDNN_LISTS}) + foreach(target ${TEST_ONEDNN_LISTS}) py_test_modules(${target} MODULES ${target}) set_tests_properties(${target} PROPERTIES LABELS "RUN_TYPE=INFER" TIMEOUT 120) diff --git a/test/deprecated/mkldnn/test_activation_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_activation_mkldnn_op_deprecated.py index 4bfa8ff2d99668..b03853ff809151 100644 --- a/test/deprecated/mkldnn/test_activation_mkldnn_op_deprecated.py +++ 
b/test/deprecated/mkldnn/test_activation_mkldnn_op_deprecated.py @@ -56,7 +56,7 @@ class TestONEDNNReluDim2(TestRelu): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_dtype(self): self.dtype = np.float32 @@ -66,7 +66,7 @@ class TestONEDNNRelu_ZeroDim(TestRelu_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_dtype(self): self.dtype = np.float32 @@ -75,7 +75,7 @@ def init_dtype(self): class TestONEDNNRelu6Dim2(TestRelu6): def setUp(self): super().setUp() - self.attrs.update({"use_mkldnn": True}) + self.attrs.update({"use_onednn": True}) self.check_pir_onednn = False def init_dtype(self): @@ -85,7 +85,7 @@ def init_dtype(self): class TestONEDNNRelu6_ZeroDim(TestRelu6_ZeroDim): def setUp(self): super().setUp() - self.attrs.update({"use_mkldnn": True}) + self.attrs.update({"use_onednn": True}) self.check_pir_onednn = False def init_dtype(self): @@ -96,7 +96,7 @@ class TestONEDNNLeakyReluDim2(TestLeakyRelu): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -117,7 +117,7 @@ class TestONEDNNLeakyRelu_ZeroDim(TestLeakyRelu_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -135,7 +135,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -150,7 +150,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -165,7 +165,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True, "approximate": True} + self.attrs = {"use_onednn": True, "approximate": True} self.check_pir_onednn = False @@ -173,7 +173,7 @@ class TestONEDNNTanhDim2(TestTanh): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -184,7 +184,7 @@ class TestONEDNNTanh_ZeroDim(TestTanh_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -195,7 +195,7 @@ class TestONEDNNSqrtDim2(TestSqrt): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -206,7 +206,7 @@ class TestONEDNNSqrt_ZeroDim(TestSqrt_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -216,7 +216,7 @@ def init_dtype(self): class TestONEDNNAbsDim2(TestAbs): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_dtype(self): self.dtype = np.float32 @@ -226,7 +226,7 @@ class TestONEDNNAbsZeroSize(TestAbs): def setUp(self): super().setUp() self.check_pir_onednn = True - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_shape(self): self.shape = [0, 12, 0] @@ -236,7 +236,7 @@ class 
TestONEDNNAbsZeroSize1(TestONEDNNAbsZeroSize): def setUp(self): super().setUp() self.check_pir_onednn = True - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_shape(self): self.shape = [0, 12, 0] @@ -245,7 +245,7 @@ def init_shape(self): class TestONEDNNAbs_ZeroDim(TestAbs_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_dtype(self): self.dtype = np.float32 @@ -255,7 +255,7 @@ class TestONEDNNSwishDim2(TestSwish): def setUp(self): super().setUp() - self.attrs["use_mkldnn"] = True + self.attrs["use_onednn"] = True self.check_pir_onednn = False def init_dtype(self): @@ -266,7 +266,7 @@ class TestONEDNNSwish_ZeroDim(TestSwish_ZeroDim): def setUp(self): super().setUp() - self.attrs["use_mkldnn"] = True + self.attrs["use_onednn"] = True self.check_eager = False self.check_pir_onednn = False @@ -277,27 +277,27 @@ def init_dtype(self): class TestONEDNNHardSwishDim2(TestHardSwish): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False class TestONEDNNHardSwish_ZeroDim(TestHardSwish_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False class TestONEDNNSigmoidDim2(TestSigmoid): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} class TestONEDNNSigmoid_ZeroDim(TestSigmoid_ZeroDim): def setUp(self): super().setUp() - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} class TestONEDNNReluDim4(TestRelu): @@ -311,7 +311,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_dtype(self): self.dtype = np.float32 @@ -328,7 +328,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -356,7 +356,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -371,7 +371,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True, "approximate": True} + self.attrs = {"use_onednn": True, "approximate": True} self.check_pir_onednn = False @@ -389,7 +389,7 @@ def setUp(self): self.inputs = {'X': convert_float_to_uint16(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def test_check_output(self): @@ -413,7 +413,7 @@ def setUp(self): self.inputs = {'X': convert_float_to_uint16(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True, "approximate": True} + self.attrs = {"use_onednn": True, "approximate": True} self.check_pir_onednn = False def test_check_output(self): @@ -431,7 +431,7 @@ def setUp(self): 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") } self.outputs = {'Out': np.tanh(self.inputs['X'])} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -443,7 +443,7 @@ def setUp(self): 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") } self.outputs = {'Out': 
np.sqrt(self.inputs['X'])} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -456,7 +456,7 @@ def setUp(self): x[np.abs(x) < 0.005] = 0.02 self.inputs = {'X': x} self.outputs = {'Out': np.abs(self.inputs['X'])} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_dtype(self): self.dtype = np.float32 @@ -487,7 +487,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def init_dtype(self): @@ -505,7 +505,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -520,7 +520,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False @@ -533,7 +533,7 @@ def setUp(self): self.inputs = {'X': x} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def test_check_output(self): @@ -554,7 +554,7 @@ def setUp(self): self.inputs = {'X': x} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} self.check_pir_onednn = False def test_check_output(self): @@ -574,7 +574,7 @@ def setUp(self): out = 1 / (1 + np.exp(-x)) self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} class TestONEDNNEluDefaultAlpha(TestActivation): @@ -586,7 +586,7 @@ def setUp(self): x = np.random.random((5, 5, 4)).astype("float32") self.inputs = {'X': x} - self.attrs = {'use_mkldnn': True, 'alpha': self.alpha} + self.attrs = {'use_onednn': True, 'alpha': self.alpha} self.outputs = { 'Out': np.maximum(0, x) + np.minimum(0, self.alpha * (np.exp(x) - 1)) @@ -606,7 +606,7 @@ def setUp(self): x = np.random.random(()).astype("float32") self.inputs = {'X': x} - self.attrs = {'use_mkldnn': True, 'alpha': self.alpha} + self.attrs = {'use_onednn': True, 'alpha': self.alpha} self.outputs = { 'Out': np.maximum(0, x) + np.minimum(0, self.alpha * (np.exp(x) - 1)) @@ -629,7 +629,7 @@ def setUp(self): x = np.random.random((5, 5, 4)).astype("float32") self.inputs = {'X': x} - self.attrs = {'use_mkldnn': True} + self.attrs = {'use_onednn': True} self.outputs = {'Out': np.exp(x)} self.check_pir_onednn = False @@ -641,7 +641,7 @@ def setUp(self): x = np.random.random(()).astype("float32") self.inputs = {'X': x} - self.attrs = {'use_mkldnn': True} + self.attrs = {'use_onednn': True} self.outputs = {'Out': np.exp(x)} self.check_pir_onednn = False @@ -674,7 +674,7 @@ def test_check(self): class TestONEDNNSoftplusDim2(TestSoftplus): def setUp(self): super().setUp() - self.attrs.update({"use_mkldnn": True}) + self.attrs.update({"use_onednn": True}) self.check_pir_onednn = False def init_dtype(self): @@ -684,7 +684,7 @@ def init_dtype(self): class TestONEDNNSoftplus_ZeroDim(TestSoftplus_ZeroDim): def setUp(self): super().setUp() - self.attrs.update({"use_mkldnn": True}) + self.attrs.update({"use_onednn": True}) def init_dtype(self): self.dtype = np.float32 diff --git a/test/deprecated/mkldnn/test_clip_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_clip_mkldnn_op_deprecated.py index 
bd9adb38dcc865..3f30cfee0892bd 100644 --- a/test/deprecated/mkldnn/test_clip_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_clip_mkldnn_op_deprecated.py @@ -59,7 +59,7 @@ def adjust_op_settings(self): pass def set_attrs(self): - self.attrs = {'min': 7.2, 'max': 9.6, 'use_mkldnn': True} + self.attrs = {'min': 7.2, 'max': 9.6, 'use_onednn': True} def test_check_output(self): self.check_output(check_dygraph=False, check_pir_onednn=True) diff --git a/test/deprecated/mkldnn/test_concat_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_concat_mkldnn_op_deprecated.py index 59e6590b0ddec1..9bef735b1e48a5 100644 --- a/test/deprecated/mkldnn/test_concat_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_concat_mkldnn_op_deprecated.py @@ -32,7 +32,7 @@ def setUp(self): self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} self.attrs = { 'axis': self.axis, - 'use_mkldnn': True, + 'use_onednn': True, 'mkldnn_data_type': self.onednn_data_type, } @@ -117,7 +117,7 @@ def setUp(self): self.inputs = {'X': [(f'x{i}', self.x) for i in range(136)]} self.attrs = { 'axis': self.axis, - 'use_mkldnn': True, + 'use_onednn': True, 'mkldnn_data_type': self.onednn_data_type, } diff --git a/test/deprecated/mkldnn/test_layer_norm_bf16_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_layer_norm_bf16_mkldnn_op_deprecated.py index 9b656f3aa0bf85..52f03f6e3ff22a 100644 --- a/test/deprecated/mkldnn/test_layer_norm_bf16_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_layer_norm_bf16_mkldnn_op_deprecated.py @@ -116,7 +116,7 @@ def check_forward( attrs={ "epsilon": epsilon, "begin_norm_axis": begin_norm_axis, - "use_mkldnn": True, + "use_onednn": True, "is_test": with_is_test, }, ) diff --git a/test/deprecated/mkldnn/test_layer_norm_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_layer_norm_mkldnn_op_deprecated.py index a3d56abd628405..226a7602b5c58c 100644 --- a/test/deprecated/mkldnn/test_layer_norm_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_layer_norm_mkldnn_op_deprecated.py @@ -126,7 +126,7 @@ def check_forward( attrs={ "epsilon": epsilon, "begin_norm_axis": begin_norm_axis, - "use_mkldnn": True, + "use_onednn": True, "is_test": with_is_test, }, ) diff --git a/test/deprecated/mkldnn/test_prelu_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_prelu_mkldnn_op_deprecated.py index 304830b673fbe5..72e65827acf1a6 100644 --- a/test/deprecated/mkldnn/test_prelu_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_prelu_mkldnn_op_deprecated.py @@ -59,7 +59,7 @@ def setUp(self): self.x = np.random.random((2, 4, 5, 5)).astype("float32") + 1 self.init_attrs() self.set_inputs() - self.attrs = {'mode': self.mode, 'use_mkldnn': True} + self.attrs = {'mode': self.mode, 'use_onednn': True} self.set_dtype_attr() self.outputs = {'Out': ref_prelu(self.x, self.alpha, self.mode)} @@ -102,7 +102,7 @@ def setUp(self): self.x = np.random.random(()).astype("float32") self.init_attrs() self.set_inputs() - self.attrs = {'mode': self.mode, 'use_mkldnn': True} + self.attrs = {'mode': self.mode, 'use_onednn': True} self.set_dtype_attr() self.outputs = {'Out': self.x if self.x > 0 else self.x * self.alpha} diff --git a/test/deprecated/mkldnn/test_reduce_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_reduce_mkldnn_op_deprecated.py index 5c44e58f4f33e0..b9f52322bb95ba 100644 --- a/test/deprecated/mkldnn/test_reduce_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_reduce_mkldnn_op_deprecated.py @@ -26,7 +26,7 @@ def setUp(self): self.use_onednn = True 
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} self.outputs = {'Out': self.inputs['X'].sum(axis=0)} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} self.check_pir_onednn = True def test_check_output(self): @@ -53,7 +53,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random((5, 10, 5, 5)).astype("float32")} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': [2]} + self.attrs = {'use_onednn': self.use_onednn, 'dim': [2]} self.outputs = { 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) } @@ -66,7 +66,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random((5, 10, 5, 3)).astype("float32")} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': [0, 1, 2, 3]} + self.attrs = {'use_onednn': self.use_onednn, 'dim': [0, 1, 2, 3]} self.outputs = { 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) } @@ -77,7 +77,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random((2, 5, 3, 2, 2)).astype("float32")} - self.attrs = {'dim': (2, 3, 4), 'keep_dim': True, 'use_mkldnn': True} + self.attrs = {'dim': (2, 3, 4), 'keep_dim': True, 'use_onednn': True} self.outputs = { 'Out': self.inputs['X'].sum( axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim'] @@ -90,7 +90,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random(()).astype("float32")} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': []} + self.attrs = {'use_onednn': self.use_onednn, 'dim': []} self.outputs = { 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) } @@ -103,7 +103,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random((2, 5, 3, 2, 2)).astype("float32")} - self.attrs = {'reduce_all': True, 'keep_dim': True, 'use_mkldnn': True} + self.attrs = {'reduce_all': True, 'keep_dim': True, 'use_onednn': True} self.outputs = { 'Out': self.inputs['X'].sum(keepdims=self.attrs['keep_dim']) } @@ -115,7 +115,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} - self.attrs = {'reduce_all': True, 'use_mkldnn': self.use_onednn} + self.attrs = {'reduce_all': True, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.inputs['X'].sum()} self.check_pir_onednn = False @@ -131,7 +131,7 @@ def setUp(self): self.op_type = "reduce_sum" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} - self.attrs = {'dim': (), 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': (), 'use_onednn': self.use_onednn} self.outputs = {'Out': np.copy(self.inputs['X'])} @@ -146,7 +146,7 @@ def setUp(self): self.op_type = "reduce_max" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} - self.attrs = {'dim': [-1], 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': [-1], 'use_onednn': self.use_onednn} self.outputs = { 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])) } @@ -161,7 +161,7 @@ def setUp(self): self.op_type = "reduce_max" self.use_onednn = True self.inputs = {'X': np.random.random(()).astype("float32")} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': []} + self.attrs = {'use_onednn': self.use_onednn, 'dim': []} self.outputs = { 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])) } @@ -180,7 +180,7 @@ def 
setUp(self): self.op_type = "reduce_max" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 10, 9)).astype("float32")} - self.attrs = {'dim': [-1, 0, 1], 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': [-1, 0, 1], 'use_onednn': self.use_onednn} self.outputs = { 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])) } @@ -197,7 +197,7 @@ def setUp(self): self.op_type = "reduce_min" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} - self.attrs = {'dim': [2], 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': [2], 'use_onednn': self.use_onednn} self.outputs = { 'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim'])) } @@ -212,7 +212,7 @@ def setUp(self): self.op_type = "reduce_min" self.use_onednn = True self.inputs = {'X': np.random.random(()).astype("float32")} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': []} + self.attrs = {'use_onednn': self.use_onednn, 'dim': []} self.outputs = { 'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim'])) } @@ -223,7 +223,7 @@ def setUp(self): self.op_type = "reduce_mean" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} - self.attrs = {'dim': [0], 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': [0], 'use_onednn': self.use_onednn} self.outputs = { 'Out': self.inputs['X'].sum(axis=0) / self.inputs['X'].shape[0] } @@ -234,7 +234,7 @@ def setUp(self): self.op_type = "reduce_mean" self.use_onednn = True self.inputs = {'X': np.random.random(()).astype("float32")} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': []} + self.attrs = {'use_onednn': self.use_onednn, 'dim': []} self.outputs = { # scalar mean is equal to sum 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) @@ -246,7 +246,7 @@ def setUp(self): self.op_type = "reduce_mean" self.use_onednn = True self.inputs = {'X': np.random.random((5, 6, 8, 10)).astype("float32")} - self.attrs = {'reduce_all': True, 'use_mkldnn': self.use_onednn} + self.attrs = {'reduce_all': True, 'use_onednn': self.use_onednn} self.outputs = { 'Out': self.inputs['X'].sum() / np.asarray(self.inputs['X'].shape).prod() diff --git a/test/deprecated/mkldnn/test_reshape_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_reshape_mkldnn_op_deprecated.py index be2c1c948a19cd..8f48abd784a29d 100644 --- a/test/deprecated/mkldnn/test_reshape_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_reshape_mkldnn_op_deprecated.py @@ -36,7 +36,7 @@ def setUp(self): 'XShape': np.random.random(self.ori_shape).astype("float32"), } self.x = self.inputs["X"] - self.attrs['use_mkldnn'] = True + self.attrs['use_onednn'] = True self.set_additional_inputs() self.set_outputs() @@ -208,7 +208,7 @@ def setUp(self): super().setUp() self.dtype = np.uint16 self.inputs = {"X": convert_float_to_uint16(self.x)} - self.attrs['use_mkldnn'] = True + self.attrs['use_onednn'] = True def calculate_grads(self): self.dout = self.outputs['Out'] diff --git a/test/deprecated/mkldnn/test_scale_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_scale_mkldnn_op_deprecated.py index 1d50d92e8e4581..9570bb2091edb8 100644 --- a/test/deprecated/mkldnn/test_scale_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_scale_mkldnn_op_deprecated.py @@ -25,7 +25,7 @@ def setUp(self): self.init_shape() self.op_type = "scale" self.inputs = {'X': np.random.random(self.shape).astype(np.float32)} - self.attrs = {'scale': -2.3, 'use_mkldnn': True, 'bias': 0.2} + self.attrs = {'scale': -2.3, 'use_onednn': True, 'bias': 0.2} self.use_onednn = 
True self.outputs = { 'Out': (self.inputs['X'] * self.attrs['scale']) + self.attrs['bias'] @@ -54,7 +54,7 @@ def setUp(self): self.inputs = {'X': np.random.random((10, 10)).astype(np.float32)} self.attrs = { 'scale': 1.5, - 'use_mkldnn': True, + 'use_onednn': True, 'bias': 2.3, 'bias_after_scale': False, } diff --git a/test/deprecated/mkldnn/test_softmax_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_softmax_mkldnn_op_deprecated.py index 6056535c6d9eb2..645d1e675e6bad 100644 --- a/test/deprecated/mkldnn/test_softmax_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_softmax_mkldnn_op_deprecated.py @@ -67,7 +67,7 @@ def setUp(self): self.attrs = { 'axis': self.axis, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } def test_check_output(self): diff --git a/test/deprecated/mkldnn/test_split_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_split_mkldnn_op_deprecated.py index 3a01f29aa0d305..95d65ed46e8699 100644 --- a/test/deprecated/mkldnn/test_split_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_split_mkldnn_op_deprecated.py @@ -52,7 +52,7 @@ def setUp(self): self.init_data_type() self.init_test_case() self.inputs = {'X': self.x} - self.attrs = {'use_mkldnn': True, 'num': self.num} + self.attrs = {'use_onednn': True, 'num': self.num} if self.axis is not None: self.attrs['axis'] = self.axis diff --git a/test/deprecated/mkldnn/test_sum_mkldnn_op_deprecated.py b/test/deprecated/mkldnn/test_sum_mkldnn_op_deprecated.py index a00e1c6096757d..3ca84284f7f3f6 100644 --- a/test/deprecated/mkldnn/test_sum_mkldnn_op_deprecated.py +++ b/test/deprecated/mkldnn/test_sum_mkldnn_op_deprecated.py @@ -32,7 +32,7 @@ def setUp(self): self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} y = x0 + x1 + x2 self.outputs = {'Out': y} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} def init_data_type(self): self.dtype = np.float32 @@ -73,7 +73,7 @@ def test_check_output(self): tensor.set(var_value, place) sum_op = Operator( - "sum", X=["x0", "x1"], Out=out_var_name, use_mkldnn=True + "sum", X=["x0", "x1"], Out=out_var_name, use_onednn=True ) expected_out = np.array(self.x0 + self.x1) sum_op.run(scope, place) diff --git a/test/deprecated/quantization/CMakeLists.txt b/test/deprecated/quantization/CMakeLists.txt index c5b4d9d3a67137..dbf0dbd0806a43 100644 --- a/test/deprecated/quantization/CMakeLists.txt +++ b/test/deprecated/quantization/CMakeLists.txt @@ -5,13 +5,13 @@ file( string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") function(_inference_analysis_python_api_int8_test target model_dir data_path - filename use_mkldnn) + filename use_onednn) py_test( ${target} SRCS ${filename} ENVS CPU_NUM_THREADS=${CPU_NUM_THREADS_ON_CI} - FLAGS_use_onednn=${use_mkldnn} + FLAGS_use_onednn=${use_onednn} ARGS --infer_model ${model_dir}/model @@ -207,7 +207,7 @@ if(NOT WITH_GPU) list(REMOVE_ITEM TEST_OPS test_apply_per_channel_scale) endif() -if(LINUX AND WITH_MKLDNN) +if(LINUX AND WITH_ONEDNN) #### Image classification dataset: ImageNet (small) # The dataset should already be downloaded for INT8v2 unit tests diff --git a/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py b/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py index d7221b53ecbd50..2a73ad7154f4fe 100644 --- a/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py +++ b/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py @@ -64,7 +64,7 @@ def prepare_program_mul(self, 
program): type=self.op_name(), inputs={"X": block.var('mul_input'), "Y": block.var('mul_weights')}, outputs={"Out": block.var('mul_output')}, - attrs={'use_mkldnn': self.use_onednn}, + attrs={'use_onednn': self.use_onednn}, ) def test_dequantize_op_weights(self): @@ -179,7 +179,7 @@ def prepare_program_conv2d(self, program): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'fuse_relu': True, }, @@ -197,7 +197,7 @@ def prepare_program_conv2d(self, program): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, }, ) @@ -312,7 +312,7 @@ def prepare_program(self, program): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'fuse_relu': True, }, @@ -329,7 +329,7 @@ def prepare_program(self, program): 'out_w': self.out_w, 'scale': self.scale, 'data_layout': self.data_layout, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, }, ) block.append_op( diff --git a/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py b/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py index addd9aad1179b9..2100bdccaa4857 100644 --- a/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py +++ b/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py @@ -60,7 +60,7 @@ def conv_net(img, label): return avg_loss -class TestMKLDNNTransformBasedFreezePass(unittest.TestCase): +class TestONEDNNTransformBasedFreezePass(unittest.TestCase): def setUp(self): self.quantizable_op_and_inputs = { 'conv2d': ['Input', 'Filter'], diff --git a/test/dygraph_to_static/simnet_dygraph_model.py b/test/dygraph_to_static/simnet_dygraph_model.py index 35262bd77e8397..a3e19de4cc3670 100644 --- a/test/dygraph_to_static/simnet_dygraph_model.py +++ b/test/dygraph_to_static/simnet_dygraph_model.py @@ -410,7 +410,7 @@ def forward(self, input): type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}, - attrs={"use_mkldnn": False}, + attrs={"use_onednn": False}, ) if self._b is not None: diff --git a/test/ir/inference/inference_pass_test.py b/test/ir/inference/inference_pass_test.py index 34bdfb4d2c16c5..ae823dfeea9ad9 100644 --- a/test/ir/inference/inference_pass_test.py +++ b/test/ir/inference/inference_pass_test.py @@ -37,7 +37,7 @@ def __init__(self, methodName='runTest'): self.feeds = None self.fetch_list = None - self.enable_mkldnn = False + self.enable_onednn = False self.enable_onednn_bfloat16 = False self.enable_trt = False self.enable_tensorrt_varseqlen = False @@ -130,7 +130,7 @@ def _get_inference_outs(self, config): return outs def _get_analysis_config( - self, use_gpu=False, use_trt=False, use_mkldnn=False + self, use_gpu=False, use_trt=False, use_onednn=False ): ''' Return a new object of AnalysisConfig. @@ -178,7 +178,7 @@ def _get_analysis_config( if self.enable_tensorrt_varseqlen: config.enable_tensorrt_varseqlen() - elif use_mkldnn: + elif use_onednn: config.enable_onednn() if self.enable_onednn_bfloat16: config.enable_onednn_bfloat16() @@ -286,10 +286,10 @@ def check_output_with_option( ) # Check whether the onednn results and the CPU results are the same. 
- if (not use_gpu) and self.enable_mkldnn: + if (not use_gpu) and self.enable_onednn: onednn_outputs = self._get_inference_outs( self._get_analysis_config( - use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn + use_gpu=use_gpu, use_onednn=self.enable_onednn ) ) diff --git a/test/ir/inference/quant_dequant_test.py b/test/ir/inference/quant_dequant_test.py index f955273a88667f..1091e0282fb74a 100644 --- a/test/ir/inference/quant_dequant_test.py +++ b/test/ir/inference/quant_dequant_test.py @@ -46,7 +46,7 @@ def __init__(self, methodName='runTest'): self.test_startup_program = paddle.static.Program() self.feeds = None self.fetch_list = None - self.enable_mkldnn = False + self.enable_onednn = False self.enable_onednn_bfloat16 = False self.enable_trt = False self.enable_tensorrt_varseqlen = True @@ -190,7 +190,7 @@ def _get_inference_outs(self, config): return outs def _get_analysis_config( - self, use_gpu=False, use_trt=False, use_mkldnn=False + self, use_gpu=False, use_trt=False, use_onednn=False ): ''' Return a new object of AnalysisConfig. @@ -230,7 +230,7 @@ def _get_analysis_config( if self.enable_tensorrt_varseqlen: config.enable_tensorrt_varseqlen() - elif use_mkldnn: + elif use_onednn: config.enable_onednn() if self.enable_onednn_bfloat16: config.enable_onednn_bfloat16() @@ -388,10 +388,10 @@ def check_output_with_option( ) # Check whether the onednn results and the CPU results are the same. - if (not use_gpu) and self.enable_mkldnn: + if (not use_gpu) and self.enable_onednn: onednn_outputs = self._get_inference_outs( self._get_analysis_config( - use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn + use_gpu=use_gpu, use_onednn=self.enable_onednn ) ) diff --git a/test/ir/inference/test_conv_act_onednn_fuse_pass.py b/test/ir/inference/test_conv_act_onednn_fuse_pass.py index 8392b19875abfa..4c7b0d2e1cc5aa 100755 --- a/test/ir/inference/test_conv_act_onednn_fuse_pass.py +++ b/test/ir/inference/test_conv_act_onednn_fuse_pass.py @@ -207,7 +207,7 @@ def sample_program_config(self, draw): groups=groups, dilations=dilations, data_format=data_format, - use_mkldnn=True, + use_onednn=True, ) ops = [conv2d_op, act_op] diff --git a/test/ir/inference/test_conv_bn_fuse_pass.py b/test/ir/inference/test_conv_bn_fuse_pass.py index 9cfd09d53ca9e7..d4861008858257 100644 --- a/test/ir/inference/test_conv_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_bn_fuse_pass.py @@ -108,7 +108,7 @@ def generate_bn_Var(): groups=groups, paddings=paddings, strides=strides, - use_mkldnn=use_onednn, + use_onednn=use_onednn, has_bias=False, is_test=True, ) @@ -158,7 +158,7 @@ def generate_bn_Var(): def sample_predictor_configs(self, program_config): # for onednn - if program_config.ops[0].attrs['use_mkldnn']: + if program_config.ops[0].attrs['use_onednn']: config = self.create_inference_config(use_onednn=True) yield config, ['fused_conv2d'], (1e-5, 1e-5) else: diff --git a/test/ir/inference/test_conv_transpose_bn_fuse_pass.py b/test/ir/inference/test_conv_transpose_bn_fuse_pass.py index 31e9bc98973814..99fddb614697ef 100644 --- a/test/ir/inference/test_conv_transpose_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_transpose_bn_fuse_pass.py @@ -67,7 +67,7 @@ def sample_program_config(self, draw): st.sampled_from(["EXPLICIT", "SAME", "VALID"]) ) random_data_layout = draw(st.sampled_from(["NCHW", "NHWC"])) - random_use_mkldnn = draw(st.booleans()) + random_use_onednn = draw(st.booleans()) random_output_size = [] random_filter = draw( st.lists( @@ -133,7 +133,7 @@ def generate_batch_norm_Variance(): 'data_format': 
random_data_layout, 'output_size': random_output_size, 'output_padding': random_output_size, - 'use_mkldnn': random_use_mkldnn, + 'use_mkldnn': random_use_onednn, 'is_test': True, }, ) @@ -160,7 +160,7 @@ def generate_batch_norm_Variance(): 'is_test': True, 'trainable_statistics': False, 'data_layout': random_data_layout, - 'use_mkldnn': random_use_mkldnn, + 'use_mkldnn': random_use_onednn, }, ) diff --git a/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py b/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py index 50b19a7ffba3a4..216b661156b76e 100644 --- a/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py @@ -71,7 +71,7 @@ def sample_program_config(self, draw): st.sampled_from(["EXPLICIT", "SAME", "VALID"]) ) random_data_layout = draw(st.sampled_from(["NCHW", "NHWC"])) - random_use_mkldnn = draw(st.booleans()) + random_use_onednn = draw(st.booleans()) random_output_size = [] random_filter = draw( st.lists( @@ -141,7 +141,7 @@ def generate_batch_norm_Variance(): 'data_format': random_data_layout, 'output_size': random_output_size, 'output_padding': random_output_size, - 'use_mkldnn': random_use_mkldnn, + 'use_mkldnn': random_use_onednn, 'is_test': True, }, ) @@ -182,7 +182,7 @@ def generate_batch_norm_Variance(): 'is_test': True, 'trainable_statistics': False, 'data_layout': random_data_layout, - 'use_mkldnn': random_use_mkldnn, + 'use_mkldnn': random_use_onednn, }, ) diff --git a/test/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py b/test/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py index 80cd83e79f8338..cd01ad161725ae 100644 --- a/test/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py +++ b/test/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py @@ -99,7 +99,7 @@ def sample_program_config(self, draw): padding_weights=False, activation_type="", use_quantizer=False, - use_mkldnn=False, + use_onednn=False, ) add_op = OpConfig( "elementwise_add", diff --git a/test/ir/inference/test_mkldnn_int8_scale_calculation_pass.py b/test/ir/inference/test_mkldnn_int8_scale_calculation_pass.py index 6c61d24ac269f8..456a0781118b54 100644 --- a/test/ir/inference/test_mkldnn_int8_scale_calculation_pass.py +++ b/test/ir/inference/test_mkldnn_int8_scale_calculation_pass.py @@ -123,7 +123,7 @@ def sample_program_config(self, draw): bias_shape = [f_shape[0]] inputs = {} weights = {} - use_mkldnn = True + use_onednn = True has_bias = draw(st.booleans()) if has_bias: @@ -154,7 +154,7 @@ def sample_program_config(self, draw): groups=groups, dilations=dilations, data_format=data_format, - use_mkldnn=use_mkldnn, + use_onednn=use_onednn, mkldnn_data_type="int8", ) diff --git a/test/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py index eb73fa54ae6806..e53c32bcdaf298 100644 --- a/test/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py @@ -140,7 +140,7 @@ def generate_input(type): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ 'matmul_activation_onednn_fuse_pass', 'operator_scale_onednn_fuse_pass', diff --git a/test/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py index 278b2b4102cf2d..252378c60b36d5 100644 --- 
a/test/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py @@ -131,7 +131,7 @@ def generate_input(): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ 'matmul_elementwise_add_onednn_fuse_pass', 'matmul_activation_onednn_fuse_pass', diff --git a/test/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py index 0f9db3a18eadb7..96b978d88c5cf7 100644 --- a/test/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py @@ -74,7 +74,7 @@ def generate_input(): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, passes=['matmul_elementwise_add_onednn_fuse_pass'] + use_onednn=True, passes=['matmul_elementwise_add_onednn_fuse_pass'] ) yield config, ['fused_matmul'], (1e-5, 1e-5) @@ -137,7 +137,7 @@ def generate_input(): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, passes=['matmul_elementwise_add_onednn_fuse_pass'] + use_onednn=True, passes=['matmul_elementwise_add_onednn_fuse_pass'] ) yield config, ['fused_matmul'], (1e-5, 1e-5) @@ -203,7 +203,7 @@ def generate_input_redisual(): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, passes=['matmul_elementwise_add_onednn_fuse_pass'] + use_onednn=True, passes=['matmul_elementwise_add_onednn_fuse_pass'] ) yield config, ['fused_matmul'], (1e-5, 1e-5) diff --git a/test/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py index 7ac863e675ac7c..017b7387e5c45f 100644 --- a/test/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py @@ -144,7 +144,7 @@ def generate_input(type): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ 'matmul_activation_onednn_fuse_pass', 'operator_scale_onednn_fuse_pass', diff --git a/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py index d6be1efaa34353..cf383495f52c42 100644 --- a/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py @@ -53,7 +53,7 @@ def generate_input(type): type='matmul_v2', inputs={'X': ['matmul_X'], 'Y': ['matmul_Y']}, outputs={'Out': ['matmul_output']}, - attrs={'use_mkldnn': True}, + attrs={'use_onednn': True}, ) if matmul_as_x: @@ -65,7 +65,7 @@ def generate_input(type): type='elementwise_add', inputs=inputs, outputs={'Out': ['elementwise_add_output']}, - attrs={'axis': axis, 'use_mkldnn': True}, + attrs={'axis': axis, 'use_onednn': True}, ) model_net = [matmul_op, elt_add_op] diff --git a/test/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py b/test/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py index b4181a7e6580e0..0d86d8385d0c28 100644 --- a/test/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_scale_matmul_fuse_pass.py @@ -137,7 +137,7 @@ def generate_input(attrs, type): def sample_predictor_configs(self, program_config): config = 
self.create_inference_config( - use_mkldnn=True, passes=['scale_matmul_fuse_pass'] + use_onednn=True, passes=['scale_matmul_fuse_pass'] ) yield config, ['matmul'], (1e-5, 1e-5) diff --git a/test/ir/inference/test_onednn_conv_bias_fuse_pass.py b/test/ir/inference/test_onednn_conv_bias_fuse_pass.py index ac82c4997da3af..3a1435ad0bc0a8 100644 --- a/test/ir/inference/test_onednn_conv_bias_fuse_pass.py +++ b/test/ir/inference/test_onednn_conv_bias_fuse_pass.py @@ -130,7 +130,7 @@ def sample_program_config(self, draw): conv_bias_shape = [] inputs = {} weights = {} - use_mkldnn = None + use_onednn = None conv_type = 'conv2d' if draw(st.booleans()): conv_bias_shape = [f_shape[0]] @@ -145,7 +145,7 @@ def sample_program_config(self, draw): 'bias': TensorConfig(shape=bias_shape), 'conv_bias': TensorConfig(shape=conv_bias_shape), } - use_mkldnn = True + use_onednn = True else: inputs = { 'Input': ['input_x'], @@ -155,7 +155,7 @@ def sample_program_config(self, draw): 'filter': TensorConfig(shape=f_shape), 'bias': TensorConfig(shape=bias_shape), } - use_mkldnn = False + use_onednn = False conv2d_op = OpConfig( conv_type, @@ -167,7 +167,7 @@ def sample_program_config(self, draw): groups=groups, dilations=dilations, data_format=data_format, - use_mkldnn=use_mkldnn, + use_onednn=use_onednn, ) add_op = OpConfig( diff --git a/test/ir/inference/test_onednn_conv_bn_fuse_pass.py b/test/ir/inference/test_onednn_conv_bn_fuse_pass.py index da95b32fcda80b..18a4da54a54464 100644 --- a/test/ir/inference/test_onednn_conv_bn_fuse_pass.py +++ b/test/ir/inference/test_onednn_conv_bn_fuse_pass.py @@ -23,7 +23,7 @@ class TestOneDNNConvBnFusePass(PassAutoScanTest): def sample_program_config(self, draw): - use_mkldnn = True + use_onednn = True padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) groups = draw(st.integers(min_value=1, max_value=3)) data_format = draw(st.sampled_from(["NCHW", "NHWC"])) @@ -78,7 +78,7 @@ def generate_data(shape): groups=groups, paddings=paddings, strides=strides, - use_mkldnn=use_mkldnn, + use_onednn=use_onednn, has_bias=False, is_test=True, ) diff --git a/test/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py b/test/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py index 3d396968a76018..3cf14d3c772c2c 100644 --- a/test/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py +++ b/test/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py @@ -116,7 +116,7 @@ def generate_input(): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ 'elementwise_act_onednn_fuse_pass', 'operator_scale_onednn_fuse_pass', diff --git a/test/ir/inference/test_onednn_fc_activation_fuse_pass.py b/test/ir/inference/test_onednn_fc_activation_fuse_pass.py index 84517b6dfc8546..01923c2c3031f2 100644 --- a/test/ir/inference/test_onednn_fc_activation_fuse_pass.py +++ b/test/ir/inference/test_onednn_fc_activation_fuse_pass.py @@ -134,7 +134,7 @@ def generate_input(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ "fc_act_onednn_fuse_pass", "operator_scale_onednn_fuse_pass", diff --git a/test/ir/inference/test_onednn_fc_gru_fuse_pass.py b/test/ir/inference/test_onednn_fc_gru_fuse_pass.py index 1b2d7b0be6e4f5..069ed1fe44169d 100644 --- a/test/ir/inference/test_onednn_fc_gru_fuse_pass.py +++ b/test/ir/inference/test_onednn_fc_gru_fuse_pass.py @@ -103,7 +103,7 @@ def 
generate_input(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ 'onednn_placement_pass', 'fc_gru_fuse_pass', diff --git a/test/ir/inference/test_onednn_fc_lstm_fuse_pass.py b/test/ir/inference/test_onednn_fc_lstm_fuse_pass.py index 93e755f4032ff3..933c3477ea8330 100644 --- a/test/ir/inference/test_onednn_fc_lstm_fuse_pass.py +++ b/test/ir/inference/test_onednn_fc_lstm_fuse_pass.py @@ -107,7 +107,7 @@ def generate_data(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ 'onednn_placement_pass', 'fc_lstm_fuse_pass', diff --git a/test/ir/inference/test_onednn_multi_gru_fuse_pass.py b/test/ir/inference/test_onednn_multi_gru_fuse_pass.py index 1133504a149caa..9a5dbbf2273a8a 100644 --- a/test/ir/inference/test_onednn_multi_gru_fuse_pass.py +++ b/test/ir/inference/test_onednn_multi_gru_fuse_pass.py @@ -121,7 +121,7 @@ def generate_bias(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=['multi_gru_fuse_pass'], ) yield config, ['multi_gru'], (1e-5, 1e-5) diff --git a/test/ir/inference/test_onednn_multi_gru_seq_fuse_pass.py b/test/ir/inference/test_onednn_multi_gru_seq_fuse_pass.py index dbb1439dda96cb..43a7f1952c8bd1 100644 --- a/test/ir/inference/test_onednn_multi_gru_seq_fuse_pass.py +++ b/test/ir/inference/test_onednn_multi_gru_seq_fuse_pass.py @@ -196,7 +196,7 @@ def generate_bias(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=['multi_gru_fuse_pass', 'multi_gru_seq_fuse_pass'], ) yield config, ['multi_gru'], (1e-5, 1e-5) diff --git a/test/ir/inference/test_onednn_operator_reshape2_fuse_pass.py b/test/ir/inference/test_onednn_operator_reshape2_fuse_pass.py index 758950be6ee678..abd8f90f099632 100644 --- a/test/ir/inference/test_onednn_operator_reshape2_fuse_pass.py +++ b/test/ir/inference/test_onednn_operator_reshape2_fuse_pass.py @@ -75,7 +75,7 @@ def generate_input(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ "operator_reshape2_onednn_fuse_pass", ], diff --git a/test/ir/inference/test_onednn_operator_unsqueeze2_fuse_pass.py b/test/ir/inference/test_onednn_operator_unsqueeze2_fuse_pass.py index d1f441f3444cab..f35c355eb0314f 100644 --- a/test/ir/inference/test_onednn_operator_unsqueeze2_fuse_pass.py +++ b/test/ir/inference/test_onednn_operator_unsqueeze2_fuse_pass.py @@ -73,7 +73,7 @@ def generate_input(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ "operator_unsqueeze2_onednn_fuse_pass", ], @@ -138,7 +138,7 @@ def generate_input(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ "operator_unsqueeze2_onednn_fuse_pass", ], diff --git a/test/ir/inference/test_onednn_quant_transpose_dequant_fuse_pass.py b/test/ir/inference/test_onednn_quant_transpose_dequant_fuse_pass.py index 5c8f89bd5f8063..1ffcbf37b1054f 100644 --- a/test/ir/inference/test_onednn_quant_transpose_dequant_fuse_pass.py +++ b/test/ir/inference/test_onednn_quant_transpose_dequant_fuse_pass.py @@ -65,7 +65,7 @@ def generate_input(): 'use_mkldnn': True, 
'mkldnn_data_type': 'int8', }, - use_mkldnn=True, + use_onednn=True, ) transpose2_op_2 = OpConfig( @@ -80,7 +80,7 @@ def generate_input(): 'use_mkldnn': True, 'mkldnn_data_type': 'int8', }, - use_mkldnn=True, + use_onednn=True, ) dequantize_op = OpConfig( @@ -106,7 +106,7 @@ def generate_input(): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=['quant_transpose2_dequant_onednn_fuse_pass'], ) yield config, ['fused_transpose', 'fused_transpose'], (1e-5, 1e-5) diff --git a/test/ir/inference/test_onednn_squeeze2_transpose2_fuse_pass.py b/test/ir/inference/test_onednn_squeeze2_transpose2_fuse_pass.py index 3387f244bd4e8d..3b6f86d7d027dc 100644 --- a/test/ir/inference/test_onednn_squeeze2_transpose2_fuse_pass.py +++ b/test/ir/inference/test_onednn_squeeze2_transpose2_fuse_pass.py @@ -78,7 +78,7 @@ def generate_input(shape): def sample_predictor_configs(self, program_config): config = self.create_inference_config( - use_mkldnn=True, + use_onednn=True, passes=[ "squeeze2_transpose2_onednn_fuse_pass", ], diff --git a/test/legacy_test/hygon_dcu/hygon_llama_ops.py b/test/legacy_test/hygon_dcu/hygon_llama_ops.py index c6f0d6d20aa38d..4ead7b15c39028 100644 --- a/test/legacy_test/hygon_dcu/hygon_llama_ops.py +++ b/test/legacy_test/hygon_dcu/hygon_llama_ops.py @@ -480,7 +480,7 @@ def setUp(self): self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} y = x0 + x1 + x2 self.outputs = {'Out': y} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} def init_kernel_type(self): self.dtype = np.float16 @@ -545,7 +545,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def init_kernel_type(self): @@ -631,7 +631,7 @@ def setUp(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def init_kernel_type(self): self.use_onednn = False diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py index e6eca9654f330e..3a5d26c93b9516 100644 --- a/test/legacy_test/op_test.py +++ b/test/legacy_test/op_test.py @@ -633,8 +633,10 @@ def is_float16_op(self): def is_onednn_op(self): return (hasattr(self, "use_onednn") and self.use_onednn) or ( hasattr(self, "attrs") - and "use_mkldnn" in self.attrs - and self.attrs["use_mkldnn"] + and ( + ("use_mkldnn" in self.attrs and self.attrs["use_mkldnn"]) + or ("use_onednn" in self.attrs and self.attrs["use_onednn"]) + ) ) def is_xpu_op(self): @@ -2198,7 +2200,10 @@ def check_inplace_output_with_place( attrs_use_mkldnn = hasattr(self, 'attrs') and bool( self.attrs.get('use_mkldnn', False) ) - if flags_use_onednn or attrs_use_mkldnn: + attrs_use_onednn = hasattr(self, 'attrs') and bool( + self.attrs.get('use_onednn', False) + ) + if flags_use_onednn or attrs_use_mkldnn or attrs_use_onednn: warnings.warn( "check inplace_grad for ops using mkldnn is not supported" ) @@ -3441,9 +3446,13 @@ def check_grad_with_place( cache_list = self.cache_name_list # oneDNN numeric gradient should use CPU kernel - use_onednn = False + use_mkldnn = False if op_attrs.get("use_mkldnn"): op_attrs["use_mkldnn"] = False + use_mkldnn = True + use_onednn = False + if 
op_attrs.get("use_onednn"): + op_attrs["use_onednn"] = False use_onednn = True if hasattr(self, "attrs"): for k, v in self.attrs.items(): @@ -3459,8 +3468,10 @@ def check_grad_with_place( cache_list=cache_list, ) - if use_onednn: + if use_mkldnn: op_attrs["use_mkldnn"] = True + if use_onednn: + op_attrs["use_onednn"] = True if no_grad_set is None: no_grad_set = set() diff --git a/test/legacy_test/test_batch_norm_op.py b/test/legacy_test/test_batch_norm_op.py index b8525403e59876..556a3637791e34 100644 --- a/test/legacy_test/test_batch_norm_op.py +++ b/test/legacy_test/test_batch_norm_op.py @@ -317,7 +317,7 @@ def check_with_place(self, place, data_layout, dtype, shape): # attrs is_test=True, data_layout=data_layout, - use_mkldnn=self.use_onednn, + use_onednn=self.use_onednn, fuse_with_relu=self.fuse_with_relu, epsilon=epsilon, ) diff --git a/test/legacy_test/test_broadcast_tensors_op.py b/test/legacy_test/test_broadcast_tensors_op.py index 296aea9b007e3e..dfac9d35108a77 100644 --- a/test/legacy_test/test_broadcast_tensors_op.py +++ b/test/legacy_test/test_broadcast_tensors_op.py @@ -112,7 +112,7 @@ def set_dtype(self): def setUp(self): self.op_type = "broadcast_tensors" self.use_onednn = False - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} self.test_gen_func_list = [ gen_rank_diff_test, gen_no_broadcast_test, @@ -198,7 +198,7 @@ def setUp(self): self.dtype = np.uint16 self.np_dtype = "float32" self.use_onednn = False - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} self.test_gen_func_list = [ gen_rank_diff_test, gen_no_broadcast_test, diff --git a/test/legacy_test/test_compat_minmax.py b/test/legacy_test/test_compat_minmax.py new file mode 100644 index 00000000000000..00245894df0480 --- /dev/null +++ b/test/legacy_test/test_compat_minmax.py @@ -0,0 +1,386 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle + + +class TestCompatMinMaxBase(unittest.TestCase): + """Base class for the compat min/max tests; by default it exercises the min-related ops""" + + def __init__( + self, + *args, + test_op=paddle.compat.min, + origin_op=paddle.min, + index_op=paddle.argmin, + test_op_name="paddle.compat.min", + origin_op_name="paddle.min", + **kwargs, + ): + super().__init__(*args, **kwargs) + paddle.disable_static() + self.test_op = test_op + self.origin_op = origin_op + self.index_op = index_op + self.test_op_name = test_op_name + self.origin_op_name = origin_op_name + + def test_case1_simple_reduce_all(self): + data = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]], dtype='float32') + val = self.test_op(data) + + if self.test_op_name.endswith("min"): + self.assertAlmostEqual(val.item(), 1.0) + expected_grad = np.array([[0.5, 0.5], [0.0, 0.0]]) + else: + self.assertAlmostEqual(val.item(), 4.0) + expected_grad = np.array([[0.0, 0.0], [0.0, 1.0]]) + + data = paddle.to_tensor( + [[1.0, 1.0], [2.0, 3.0]], dtype='float32', stop_gradient=False + ) + val = self.test_op(data) + val.backward() + + np.testing.assert_allclose(data.grad.numpy(), expected_grad) + + def test_case2_reduce_dim(self): + """Test dim/keepdim""" + data = paddle.to_tensor( + [[[5, 8], [2, 1]], [[7, 3], [9, 6]]], dtype='float32' + ) + if self.test_op_name.endswith("min"): + in_dim = 1 + result = self.test_op(data, dim=in_dim) + expected_res = np.array([[[5, 3], [2, 1]]]) + self.assertEqual(result.values.shape, [2, 2]) + np.testing.assert_array_equal( + result.values.numpy(), np.array([[2, 1], [7, 3]]) + ) + np.testing.assert_array_equal( + result.indices.numpy(), np.array([[1, 1], [0, 0]]) + ) + else: + in_dim = 2 + result = self.test_op(data, dim=in_dim) + expected_res = np.array([[[7, 8], [9, 6]]]) + self.assertEqual(result.values.shape, [2, 2]) + np.testing.assert_array_equal( + result.values.numpy(), np.array([[8, 2], [7, 9]]) + ) + np.testing.assert_array_equal( + result.indices.numpy(), np.array([[1, 0], [0, 0]]) + ) + + result_keep = self.test_op(data, dim=0, keepdim=True) + self.assertEqual(result_keep.values.shape, [1, 2, 2]) + np.testing.assert_array_equal(result_keep.values.numpy(), expected_res) + + result_neg = self.test_op(data, dim=in_dim - 3) + np.testing.assert_array_equal( + result_neg.values.numpy(), result.values.numpy() + ) + + def test_case2_grad(self): + data = paddle.to_tensor( + [[[1.0, 2.0], [1.0, 3.0]], [[4.0, 1.0], [5.0, 1.0]]], + dtype='float32', + stop_gradient=False, + ) + y = data * 2 + + result = self.test_op(y, dim=2) + result.values.backward() + + if self.test_op_name.endswith("min"): + expected_grad = np.array( + [[[2.0, 0.0], [2.0, 0.0]], [[0.0, 2.0], [0.0, 2.0]]] + ) + expected_grad2 = np.array( + [[[2.0, 4.0], [0.0, 0.0]], [[8.0, 2.0], [0.0, 0.0]]] + ) + else: + expected_grad = np.array( + [[[0.0, 2.0], [0.0, 2.0]], [[2.0, 0.0], [2.0, 0.0]]] + ) + expected_grad2 = np.array( + [[[2.0, 0.0], [0.0, 6.0]], [[0.0, 2.0], [10.0, 0.0]]] + ) + np.testing.assert_allclose(data.grad.numpy(), expected_grad, atol=1e-6) + + data.clear_grad() + y = data * data + result = self.test_op(y, dim=1) + result[0].backward() + np.testing.assert_allclose(data.grad.numpy(), expected_grad2, atol=1e-6) + + def test_case3_elementwise(self): + x = paddle.to_tensor([[1, 5], [4, 2]], dtype='float32') + y = paddle.to_tensor([[3, 2], [1, 6]], dtype='float32') + z = paddle.to_tensor([3, 4], dtype='float32') + broadcast_res = self.test_op(x, z) + + result = self.test_op(x, y) + if self.test_op_name.endswith("min"): +
np.testing.assert_array_equal( + result.numpy(), np.array([[1, 2], [1, 2]]) + ) + np.testing.assert_array_equal( + broadcast_res.numpy(), np.array([[1, 4], [3, 2]]) + ) + else: + np.testing.assert_array_equal( + result.numpy(), np.array([[3, 5], [4, 6]]) + ) + np.testing.assert_array_equal( + broadcast_res.numpy(), np.array([[3, 5], [4, 4]]) + ) + + def test_case3_grad(self): + x = paddle.to_tensor( + [[1.0, 2.0], [3.0, 4.0]], dtype=paddle.float32, stop_gradient=False + ) + y = paddle.to_tensor( + [[0.5, 2.5], [2.0, 3.5]], dtype=paddle.float32, stop_gradient=False + ) + + val = self.test_op(x, y) + val.backward() + + expected_x_grad = np.array([[0.0, 1.0], [0.0, 0.0]]) + expected_y_grad = np.array([[1.0, 0.0], [1.0, 1.0]]) + if self.test_op_name.endswith("max"): + expected_x_grad = 1 - expected_x_grad + expected_y_grad = 1 - expected_y_grad + + np.testing.assert_allclose(x.grad.numpy(), expected_x_grad) + np.testing.assert_allclose(y.grad.numpy(), expected_y_grad) + + def test_edge_cases(self): + """Edge-case tests""" + # uniformly distributed gradient for a full reduction over equal values + uniform_data = paddle.ones([2, 3], dtype='float64') + uniform_data.stop_gradient = False + val = self.test_op(uniform_data) + val.sum().backward() + # uniformly distributed + expected_grad = np.full((2, 3), 1.0 / 6.0) + np.testing.assert_allclose(uniform_data.grad.numpy(), expected_grad) + + uniform_data.clear_grad() + val = self.test_op(uniform_data, 0) + val.values.sum().backward() + # take_along_axis like gradient behavior + expected_grad = np.array([[1.0, 1.0, 1.0], [0.0, 0.0, 0.0]]) + np.testing.assert_allclose(uniform_data.grad.numpy(), expected_grad) + + # 0-dim tensor + dim0_tensor = paddle.to_tensor(2, dtype='float32') + val = self.test_op(dim0_tensor) + np.testing.assert_allclose(val.numpy(), np.array(2.0, dtype=np.float32)) + + # 1-dim tensor + dim1_tensor = paddle.to_tensor([1], dtype='uint8') + val = self.test_op(dim1_tensor, dim=-1, keepdim=True) + np.testing.assert_array_equal( + val[0].numpy(), np.array([1], dtype=np.uint8) + ) + np.testing.assert_array_equal( + val[1].numpy(), np.array([0], dtype=np.int64) + ) + + def test_compare_with_index_ops_to_origin(self): + dtypes = ['float32', 'float64', 'int32', 'int64', 'uint8'] + cpu_reject_types = {'int16', 'bfloat16', 'float16'} + + for i, dtype in enumerate(dtypes): + data = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype=dtype) + # `bfloat16` and `float16` are rejected on CPU + if not data.place.is_gpu_place() and dtype in cpu_reject_types: + continue + vals_inds = self.test_op(data, dim=0) + self.assertEqual(vals_inds.values.dtype, data.dtype) + self.assertEqual(vals_inds.indices.dtype, paddle.int64) + + origin_indices = self.index_op(data, axis=0, dtype="int64") + if dtype != 'uint8': + origin_values = self.origin_op(data, axis=0) + else: + origin_values = paddle.take_along_axis( + data, origin_indices.unsqueeze(0), axis=0 + ) + origin_values.squeeze_(axis=0) + if i < 4: # all dtypes except uint8 + np.testing.assert_allclose( + vals_inds.values.numpy(), origin_values.numpy() + ) + else: + np.testing.assert_array_equal( + vals_inds.values.numpy(), origin_values.numpy() + ) + np.testing.assert_array_equal( + vals_inds[1].numpy(), origin_indices.numpy() + ) + + def test_error_handling(self): + """Test that the correct exceptions are raised; most error messages are not checked (some of them are long)""" + + err_msg1 = ( + "Tensors with integral type: 'paddle.int32' should stop gradient." + ) + err_msg2 = ( + f"{self.origin_op_name}() received unexpected keyword arguments 'input', 'dim'. 
" + f"\nDid you mean to use {self.test_op_name}() instead?" + ) + err_msg3 = ( + f"{self.test_op_name}() received unexpected keyword argument 'axis'. " + f"\nDid you mean to use {self.origin_op_name}() instead?" + ) + err_msg4 = ( + "Non-CUDA GPU placed Tensor does not have 'paddle.float16' op registered.\n" + "Paddle support following DataTypes: int32, int64, float64, float32, uint8" + ) + + # empty tensor + empty_tensor = paddle.to_tensor([], dtype='float32') + with self.assertRaises(ValueError): + self.test_op(empty_tensor) + + # mixed parameters case 1 + input_ts = paddle.to_tensor([1, 2, 3], dtype='float32') + other_ts = paddle.to_tensor([1]) + with self.assertRaises(TypeError): + self.test_op(input_ts, other=other_ts, dim=0) + + # mixed parameters case 2 + with self.assertRaises(TypeError): + self.test_op(input_ts, 0, other=other_ts) + + # trying to perform grad ops for integral types + with self.assertRaises(TypeError) as cm: + tensor = paddle.ones([2, 2], dtype=paddle.int32) + tensor.stop_gradient = False + tensors = self.test_op(tensor, dim=0) + self.assertEqual(str(cm.exception), err_msg1) + + # explicit None case 1 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, dim=None) + + # explicit None case 2 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, None, keepdim=True) + + # keepdim specified without specifying dim + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, keepdim=True) + + # Wrong *args specification case 1 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, False) + + # Wrong *args specification case 2 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, other_ts, True) + + # Tensor input for dim case 1 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, dim=paddle.to_tensor([0])) + + # Tensor input for dim case 2 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, dim=paddle.to_tensor(0)) + + # Duplicate Arguments case 1 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, 0, dim=0) + + # Duplicate Arguments case 2 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, other_ts, other=0) + + # Duplicate Arguments case 3 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, dim=0, other=0, keepdim=True) + + # Wrong API used case 1 + with self.assertRaises(TypeError) as cm: + self.origin_op(input=input_ts, dim=0) + self.assertEqual(str(cm.exception), err_msg2) + + # Wrong API used case 2 + with self.assertRaises(TypeError) as cm: + self.test_op(input_ts, axis=0) + self.assertEqual(str(cm.exception), err_msg3) + + # Rejected on CPU types + with self.assertRaises(TypeError) as cm: + tensor = paddle.to_tensor([1, 2, 3], dtype="float16") + cpu_tensor = tensor.to("cpu") + self.test_op(cpu_tensor, dim=0) + self.assertEqual(str(cm.exception), err_msg4) + + def _compare_with_origin_static(self, input_shape, axis=0, keepdim=False): + if not paddle.is_compiled_with_cuda(): + return + numel = 1 + for v in input_shape: + numel *= v + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + input_tensor = paddle.arange(numel, dtype=paddle.float32).reshape( + input_shape + ) + + y = input_tensor**2 + input_tensor + values, indices = self.test_op(y, dim=axis, keepdim=keepdim) + values += 1 + + gt_values = self.origin_op(y, axis=axis, keepdim=keepdim) + 1 + gt_indices = self.index_op(y, axis=axis, keepdim=keepdim) + + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + values_np, 
indices_np, gt_values_np, gt_indices_np = exe.run( + fetch_list=[values, indices, gt_values, gt_indices] + ) + np.testing.assert_allclose(values_np, gt_values_np) + np.testing.assert_equal(indices_np, gt_indices_np) + paddle.disable_static() + + def test_static_graph(self): + self._compare_with_origin_static([3, 10, 2], axis=1) + self._compare_with_origin_static([3, 10, 2], axis=0, keepdim=True) + self._compare_with_origin_static([17], axis=0) + + +class TestCompatMax(TestCompatMinMaxBase): + def __init__(self, *args, **kwargs): + super().__init__( + *args, + test_op=paddle.compat.max, + origin_op=paddle.max, + index_op=paddle.argmax, + test_op_name="paddle.compat.max", + origin_op_name="paddle.max", + **kwargs, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_compat_split.py b/test/legacy_test/test_compat_split.py new file mode 100644 index 00000000000000..8410e10e1e1caf --- /dev/null +++ b/test/legacy_test/test_compat_split.py @@ -0,0 +1,177 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.compat import split + + +class TestCompatSplit(unittest.TestCase): + def _compare_with_origin(self, input_tensor, size, axis=0): + pd_results = split(input_tensor, size, dim=axis) + + if isinstance(size, int): + shape_on_axis = input_tensor.shape[axis] + remaining_num = shape_on_axis % size + num_sections = shape_on_axis // size + if remaining_num == 0: + size = num_sections + else: + size = [size for _ in range(num_sections)] + size.append(remaining_num) + + origin_results = paddle.split( + input_tensor, num_or_sections=size, axis=axis + ) + + self.assertEqual(len(origin_results), len(pd_results)) + + # check values (and hence shapes) of each corresponding output section + for origin_ts, pd_ts in zip(origin_results, pd_results): + np.testing.assert_allclose(origin_ts.numpy(), pd_ts.numpy()) + + def test_basic_split(self): + """Test basic splitting with integer size""" + data = paddle.arange(12).reshape([3, 4]).astype('float32') + self._compare_with_origin(data, 1, 0) + self._compare_with_origin(data, 2, 1) + + def test_split_with_list_sections(self): + """Test splitting with list of section sizes""" + data = paddle.rand([10, 5]) + self._compare_with_origin(data, [3, 2, 5], 0) + self._compare_with_origin(data, [1, 4], -1) + + def test_chained_operations(self): + """Test split with complex operation chain""" + x = paddle.rand([8, 12]) + y = paddle.sin(x) * 2.0 + paddle.exp(x) / 3.0 + z = paddle.nn.functional.relu(y) + + z1, z2 = split(z, 7, dim=1) + + self.assertEqual(z1.shape, [8, 7]) + self.assertEqual(z2.shape, [8, 5]) + + z_np = z.numpy() + np.testing.assert_allclose(z_np[:, :7], z1.numpy()) + np.testing.assert_allclose(z_np[:, 7:], z2.numpy()) + + def test_split_grad(self): + """Test backprop for split: in1 and in2 go through identical graphs, + split by compat.split and the original split respectively""" + + def get_tensors(): + np.random.seed(114514) + np_arr = np.random.normal(0,
1, [2, 3, 4, 5]) + return paddle.to_tensor(np_arr), paddle.to_tensor(np_arr) + + in1, in2 = get_tensors() + in1.stop_gradient = False + in2.stop_gradient = False + + def computation_graph(in_tensor): + y = in_tensor * 2.3 + 3.0 + y = paddle.maximum(y, paddle.to_tensor([0], dtype=paddle.float32)) + return y.mean(axis=0) + + out1 = computation_graph(in1) + out2 = computation_graph(in2) + + packs1 = paddle.compat.split(out1, 2, dim=2) + packs2 = paddle.split(out2, [2, 2, 1], axis=2) + + res1 = packs1[0] + packs1[1] + packs1[2] + res2 = packs2[0] + packs2[1] + packs2[2] + res1.backward() + res2.backward() + np.testing.assert_allclose(in1.grad.numpy(), in2.grad.numpy()) + + def test_empty_dim(self): + """Split where one requested section has size 0""" + in_tensor = paddle.arange(72, dtype=paddle.int64).reshape([3, 12, 2]) + self._compare_with_origin(in_tensor, [5, 0, 7], axis=1) + + def test_split_with_one_block(self): + """Resulting tuple should be of length 1""" + in_tensor = paddle.arange(60, dtype=paddle.float32).reshape([3, 4, 5]) + self._compare_with_origin(in_tensor, 5, paddle.to_tensor([-1])) + self._compare_with_origin(in_tensor, [5], paddle.to_tensor(2)) + + def test_edge_cases(self): + """Test edge cases and error handling""" + x = paddle.arange(5) + s1, s2 = split(x, [3, 2]) + np.testing.assert_allclose(s1.numpy(), [0, 1, 2]) + np.testing.assert_allclose(s2.numpy(), [3, 4]) + + x = paddle.rand([2, 2, 2]) + a, b = split(x, 1, 2) + self.assertEqual(a.shape, [2, 2, 1]) + + # invalid split sections + with self.assertRaises(ValueError): + split(x, [3, 1], 1) + + # invalid split axis + with self.assertRaises(ValueError): + split(x, 2, 3) + + def test_error_hint(self): + """Test that the correct exception is raised when users pass paddle.split kwargs to paddle.compat.split, and vice versa.""" + x = paddle.randn([3, 9, 5]) + + msg_gt_1 = ( + "paddle.split() received unexpected keyword arguments 'tensor', 'split_size_or_sections', 'dim'. " + "\nDid you mean to use paddle.compat.split() instead?" + ) + msg_gt_2 = ( + "paddle.compat.split() received unexpected keyword argument 'num_or_sections'. " + "\nDid you mean to use paddle.split() instead?" + ) + msg_gt_3 = "(InvalidArgument) The dim is expected to be in range of [-3, 3), but got 3" + msg_gt_4 = "paddle.compat.split expects split_sizes have only non-negative entries, but got size = -5 on dim 2" + + split_size = paddle.to_tensor([3]) + msg_gt_5 = ( + "The type of 'split_size_or_sections' in split must be int, list or tuple in imperative mode, but " + f"received {type(split_size)}."
+ ) + + with self.assertRaises(TypeError) as cm: + tensors = paddle.split(tensor=x, split_size_or_sections=3, dim=0) + self.assertEqual(str(cm.exception), msg_gt_1) + + with self.assertRaises(TypeError) as cm: + tensors = split(x, num_or_sections=3, dim=0) + self.assertEqual(str(cm.exception), msg_gt_2) + + with self.assertRaises(ValueError) as cm: + tensors = split(x, 3, dim=3) + self.assertEqual(str(cm.exception), msg_gt_3) + + with self.assertRaises(ValueError) as cm: + tensors = split(x, [3, 3, -5], -2) + self.assertEqual(str(cm.exception), msg_gt_4) + + with self.assertRaises(TypeError) as cm: + tensors = split(x, split_size, 1) + self.assertEqual(str(cm.exception), msg_gt_5) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_compat_split_static.py b/test/legacy_test/test_compat_split_static.py new file mode 100644 index 00000000000000..006e3ec30ea077 --- /dev/null +++ b/test/legacy_test/test_compat_split_static.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.compat import split + + +class TestCompatSplitStatic(unittest.TestCase): + def _compare_with_origin_static( + self, input_shape, size, axis=0, dim_rank=-1 + ): + """dim_rank: -1 means dim is passed as a plain int, 0 means a 0-dim tensor, 1 means a tensor with shape [1]""" + numel = 1 + for v in input_shape: + numel *= v + input_axis = axis + if dim_rank == 0: + input_axis = paddle.to_tensor(axis) + elif dim_rank == 1: + input_axis = paddle.to_tensor([axis]) + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + input_tensor = paddle.arange(numel, dtype=paddle.float32).reshape( + input_shape + ) + pd_results = split(input_tensor, size, dim=input_axis) + + if isinstance(size, int): + shape_on_axis = input_tensor.shape[axis] + remaining_num = shape_on_axis % size + num_sections = shape_on_axis // size + if remaining_num == 0: + size = num_sections + else: + size = [size for _ in range(num_sections)] + size.append(remaining_num) + + origin_results = paddle.split( + input_tensor, num_or_sections=size, axis=axis + ) + assert len(pd_results) == len(origin_results), "length mismatched" + place = ( + paddle.CUDAPlace(0) + if paddle.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + results = exe.run(fetch_list=[*origin_results, *pd_results]) + length_needed = len(results) // 2 + for i in range(length_needed): + np.testing.assert_allclose( + results[i], results[i + length_needed] + ) + paddle.disable_static() + + def test_split_composite_static(self): + paddle.seed(114514) + + def get_tensors(): + np.random.seed(114514) + np_arr = np.random.normal(0, 1, [2, 3, 4, 5]) + return paddle.to_tensor(np_arr), paddle.to_tensor(np_arr) + + in1, in2 = get_tensors() + in1.stop_gradient = False + in2.stop_gradient = False + + @paddle.jit.to_static + def computation_graph(in1: paddle.Tensor, in2:
paddle.Tensor): + y1 = in1 * 1.5 + 1.0 + y1 = paddle.minimum(y1, paddle.to_tensor([0], dtype=paddle.float32)) + out1 = y1.mean(axis=0) + + y2 = in2 * 1.5 + 1.0 + y2 = paddle.minimum(y2, paddle.to_tensor([0], dtype=paddle.float32)) + out2 = y2.mean(axis=0) + + packs1 = paddle.compat.split(out1, 2, dim=2) + packs2 = paddle.split(out2, [2, 2, 1], axis=2) + + res1 = packs1[0] + packs1[1] + packs1[2] + res2 = packs2[0] + packs2[1] + packs2[2] + + return res1, res2 + + res1, res2 = computation_graph(in1, in2) + np.testing.assert_allclose(res1.numpy(), res2.numpy()) + + def test_static_graph(self): + """Test static graph execution""" + # fixed random seed for reproducibility + np.random.seed(114514) + # old static graph mode + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name='x', shape=[None, 6], dtype='float32') + result0, result1 = split(x, split_size_or_sections=[3, 3], dim=1) + output = result0 * 2.0 + paddle.sin(result1) + + place = ( + paddle.CUDAPlace(0) + if paddle.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + + input_data = np.random.rand(3, 6).astype('float32') + feed = {'x': input_data} + + results = exe.run(feed=feed, fetch_list=[result0, result1, output]) + + pd_result0, pd_result1 = results[0], results[1] + np.testing.assert_allclose(input_data[:, :3], pd_result0) + np.testing.assert_allclose(input_data[:, 3:], pd_result1) + + expected_output = input_data[:, :3] * 2.0 + np.sin( + input_data[:, 3:] + ) + np.testing.assert_allclose( + expected_output, results[2], rtol=1e-4, atol=1e-4 + ) + + paddle.disable_static() + + def test_error_hint(self): + """Test that the correct exception is raised when users pass paddle.split kwargs to paddle.compat.split, and vice versa.""" + + msg_gt_1 = "split_size_or_sections must be greater than 0." + msg_gt_2 = "len(split_size_or_sections) must not be more than input.shape[dim]." + msg_gt_3 = "The type of 'split_size_or_sections' in split must be int, list or tuple in imperative mode." + msg_gt_4 = ( + "'dim' is not allowed to be a pir.Value in a static graph: " + "\npir.Value can not be used for indexing python lists/tuples."
+ ) + + paddle.enable_static() + with self.assertRaises(AssertionError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, -2, dim=0) + self.assertEqual(str(cm.exception), msg_gt_1) + + with self.assertRaises(AssertionError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, (1, 1, 1, 1, 2, 2), dim=-1) + self.assertEqual(str(cm.exception), msg_gt_2) + + with self.assertRaises(TypeError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, paddle.to_tensor(2), dim=2) + self.assertEqual(str(cm.exception), msg_gt_3) + + with self.assertRaises(TypeError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, 2, dim=paddle.to_tensor(2)) + paddle.disable_static() + self.assertEqual(str(cm.exception), msg_gt_4) + + def test_basic_split(self): + """Test basic splitting with integer size""" + input_shape = [3, 6] + self._compare_with_origin_static(input_shape, 1, 0) + self._compare_with_origin_static(input_shape, 3, -1) + self._compare_with_origin_static(input_shape, 4, dim_rank=0) + self._compare_with_origin_static(input_shape, 3, dim_rank=1) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_complex_op.py b/test/legacy_test/test_complex_op.py index d5ba26549d5a40..d0df015677f6b0 100644 --- a/test/legacy_test/test_complex_op.py +++ b/test/legacy_test/test_complex_op.py @@ -19,7 +19,7 @@ import paddle from paddle import static -from paddle.base import dygraph +from paddle.base import core, dygraph paddle.enable_static() @@ -134,6 +134,7 @@ def test_dygraph(self): np.testing.assert_allclose(self.out, out_np, rtol=1e-05) def test_static(self): + paddle.enable_static() mp, sp = static.Program(), static.Program() with static.program_guard(mp, sp): x = static.data("x", shape=[10, 10], dtype="float64") @@ -148,5 +149,116 @@ def test_static(self): np.testing.assert_allclose(self.out, out_np, rtol=1e-05) +class OutTest(unittest.TestCase): + def setUp(self): + paddle.disable_static() + if core.is_compiled_with_cuda(): + self.place = core.CUDAPlace(0) + else: + self.place = core.CPUPlace() + + def test_complex_api(self): + def run_complex(test_type): + x = paddle.arange(2, dtype=paddle.float32).unsqueeze(-1) + y = paddle.arange(3, dtype=paddle.float32) + x.stop_gradient = False + y.stop_gradient = False + z = paddle.ones([100]) + z.stop_gradient = False + + a = x + x + b = y + y + c = z + z + + if test_type == "return": + c = paddle.complex(a, b) + elif test_type == "input_out": + paddle.complex(a, b, out=c) + elif test_type == "both_return": + c = paddle.complex(a, b, out=c) + elif test_type == "both_input_out": + tmp = paddle.complex(a, b, out=c) + + out = paddle._C_ops.complex(a, b) + np.testing.assert_allclose( + out.numpy(), + c.numpy(), + 1e-20, + 1e-20, + ) + + d = c + c + + d.mean().backward() + + return c, x.grad, y.grad, z.grad + + paddle.disable_static() + out1, x1, y1, z1 = run_complex("return") + out2, x2, y2, z2 = run_complex("input_out") + out3, x3, y3, z3 = run_complex("both_return") + out4, x4, y4, z4 = run_complex("both_input_out") + + np.testing.assert_allclose( + out1.numpy(), + out2.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_allclose( + out1.numpy(), + out3.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_allclose( + out1.numpy(), + out4.numpy(), + 1e-20, + 1e-20, + ) + + np.testing.assert_allclose( + x1.numpy(), + x2.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_allclose( + x1.numpy(), + x3.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_allclose( + x1.numpy(), + x4.numpy(), + 1e-20, + 1e-20, + ) +
np.testing.assert_allclose( + y1.numpy(), + y2.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_allclose( + y1.numpy(), + y3.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_allclose( + y1.numpy(), + y4.numpy(), + 1e-20, + 1e-20, + ) + np.testing.assert_equal(z1, None) + np.testing.assert_equal(z2, None) + np.testing.assert_equal(z3, None) + np.testing.assert_equal(z4, None) + + if __name__ == "__main__": unittest.main() diff --git a/test/legacy_test/test_conv2d_op.py b/test/legacy_test/test_conv2d_op.py index 1186f1c2720115..4ee915872aa85a 100644 --- a/test/legacy_test/test_conv2d_op.py +++ b/test/legacy_test/test_conv2d_op.py @@ -483,7 +483,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'fuse_relu_before_depthwise_conv': self.fuse_relu_before_depthwise_conv, 'exhaustive_search': self.exhaustive_search, @@ -817,7 +817,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'fuse_relu_before_depthwise_conv': self.fuse_relu_before_depthwise_conv, 'exhaustive_search': self.exhaustive_search, diff --git a/test/legacy_test/test_conv2d_transpose_op.py b/test/legacy_test/test_conv2d_transpose_op.py index 1dbfeda253f482..f62e3b5277da6a 100644 --- a/test/legacy_test/test_conv2d_transpose_op.py +++ b/test/legacy_test/test_conv2d_transpose_op.py @@ -210,7 +210,7 @@ def setUp(self): 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, 'is_test': self.is_test, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } if self.output_size is not None: diff --git a/test/legacy_test/test_conv3d_op.py b/test/legacy_test/test_conv3d_op.py index 65cc6c0c26431b..63c003118219f8 100644 --- a/test/legacy_test/test_conv3d_op.py +++ b/test/legacy_test/test_conv3d_op.py @@ -444,7 +444,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } self.outputs = {'Output': output} @@ -804,7 +804,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } self.outputs = {'Output': output} diff --git a/test/legacy_test/test_elementwise_add_op.py b/test/legacy_test/test_elementwise_add_op.py index 4d8d2d2815d942..e0000e7d6aa992 100644 --- a/test/legacy_test/test_elementwise_add_op.py +++ b/test/legacy_test/test_elementwise_add_op.py @@ -47,7 +47,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def check_dygraph(self): @@ -244,7 +244,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(convert_float_to_uint16(self.x)), 'Y': OpTest.np_dtype_to_base_dtype(convert_float_to_uint16(self.y)), } - self.attrs = {'axis': self.axis, 'use_mkldnn': False} + self.attrs = {'axis': self.axis, 'use_onednn': False} self.outputs = {'Out': convert_float_to_uint16(self.out)} self.if_enable_cinn() @@ -827,7 +827,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': 
OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -968,7 +968,7 @@ def test_warnings(self): type="elementwise_add", inputs={'X': data, 'Y': data}, outputs={'Out': out}, - attrs={'axis': 1, 'use_mkldnn': False}, + attrs={'axis': 1, 'use_onednn': False}, ) self.assertTrue( "op elementwise_add's attr axis = 1 is not the default value: -1" @@ -1042,7 +1042,7 @@ def setUp(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def check_dygraph(self): diff --git a/test/legacy_test/test_elementwise_div_op.py b/test/legacy_test/test_elementwise_div_op.py index 0ff6dd4a26bac8..e6502ebef6146b 100644 --- a/test/legacy_test/test_elementwise_div_op.py +++ b/test/legacy_test/test_elementwise_div_op.py @@ -589,7 +589,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): diff --git a/test/legacy_test/test_elementwise_floordiv_op.py b/test/legacy_test/test_elementwise_floordiv_op.py index 1a8266f27beb75..186592c609e56a 100644 --- a/test/legacy_test/test_elementwise_floordiv_op.py +++ b/test/legacy_test/test_elementwise_floordiv_op.py @@ -43,7 +43,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def test_check_output(self): diff --git a/test/legacy_test/test_elementwise_mod_op.py b/test/legacy_test/test_elementwise_mod_op.py index 3620215c186114..618643229d73ec 100644 --- a/test/legacy_test/test_elementwise_mod_op.py +++ b/test/legacy_test/test_elementwise_mod_op.py @@ -46,7 +46,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def test_check_output(self): @@ -195,7 +195,7 @@ def setUp(self): 'X': convert_float_to_uint16(OpTest.np_dtype_to_base_dtype(self.x)), 'Y': convert_float_to_uint16(OpTest.np_dtype_to_base_dtype(self.y)), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': convert_float_to_uint16(self.out)} def test_check_output(self): diff --git a/test/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py index a4f365ea92b1a8..8c6fbc679213af 100644 --- a/test/legacy_test/test_elementwise_mul_op.py +++ b/test/legacy_test/test_elementwise_mul_op.py @@ -49,7 +49,7 @@ def setUp(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def test_check_output(self): # TODO(wangzhongpu): support onednn op in dygraph mode @@ -242,7 +242,7 @@ def setUp(self): 'Y': OpTest.np_dtype_to_base_dtype(convert_float_to_uint16(self.y)), } self.outputs = {'Out': 
convert_float_to_uint16(self.out)} - self.attrs = {'axis': self.axis, 'use_mkldnn': False} + self.attrs = {'axis': self.axis, 'use_onednn': False} self.if_enable_cinn() def test_check_output(self): @@ -381,7 +381,7 @@ def init_input_attr_output(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def init_dtype(self): self.dtype = np.float64 @@ -406,7 +406,7 @@ def init_input_attr_output(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def init_axis(self): self.axis = 0 @@ -592,7 +592,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): diff --git a/test/legacy_test/test_elementwise_sub_op.py b/test/legacy_test/test_elementwise_sub_op.py index 28e336539f868d..736f1b33d7f7c5 100644 --- a/test/legacy_test/test_elementwise_sub_op.py +++ b/test/legacy_test/test_elementwise_sub_op.py @@ -859,7 +859,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} self.if_check_prim() self.if_enable_cinn() @@ -1207,7 +1207,7 @@ def test_warnings(self): type="elementwise_sub", inputs={'X': data, 'Y': data}, outputs={'Out': out}, - attrs={'axis': 1, 'use_mkldnn': False}, + attrs={'axis': 1, 'use_onednn': False}, ) self.assertTrue( "op elementwise_sub's attr axis = 1 is not the default value: -1" diff --git a/test/legacy_test/test_expand_v2_op.py b/test/legacy_test/test_expand_v2_op.py index 5e7ca4b28f92ef..ccf32a49665cbf 100644 --- a/test/legacy_test/test_expand_v2_op.py +++ b/test/legacy_test/test_expand_v2_op.py @@ -764,7 +764,7 @@ def setUp(self): self.init_place() self.python_api = paddle.expand self.x = np.zeros(self.ori_shape).astype("float32") - self.attrs = {'shape': self.shape, 'use_mkldnn': True} + self.attrs = {'shape': self.shape, 'use_onednn': True} self.use_onednn = True self.set_inputs() self.set_additional_inputs() diff --git a/test/legacy_test/test_fc_op.py b/test/legacy_test/test_fc_op.py index d61c93361097b7..a740ce0c49c304 100644 --- a/test/legacy_test/test_fc_op.py +++ b/test/legacy_test/test_fc_op.py @@ -73,7 +73,7 @@ def setUp(self): activation_type = "relu" else: activation_type = "" - self.attrs = {'use_mkldnn': False, 'activation_type': activation_type} + self.attrs = {'use_onednn': False, 'activation_type': activation_type} self.outputs = { 'Out': fc_refer(self.matrix, self.with_bias, self.with_relu) diff --git a/test/legacy_test/test_fused_transpose_split_quant_op.py b/test/legacy_test/test_fused_transpose_split_quant_op.py index edfea14fc1f35d..6c8604ba2ea876 100644 --- a/test/legacy_test/test_fused_transpose_split_quant_op.py +++ b/test/legacy_test/test_fused_transpose_split_quant_op.py @@ -17,8 +17,20 @@ import paddle -def fused_transpose_split_quant_ref(x, tokens_per_expert, pow_2_scales): +def dequant_ref( + fp8_tensor: paddle.Tensor, scale: paddle.Tensor, block_size: int = 128 +) -> paddle.Tensor: + """Helper function to dequantize fp8 tensor to 
bf16""" + expanded_scale = paddle.repeat_interleave(scale, repeats=128, axis=-1) + # Handle non-aligned cases by truncating + expanded_scale = expanded_scale[:, : fp8_tensor.shape[-1]] + return (fp8_tensor.astype('float32') * expanded_scale).astype('bfloat16') + + +def fused_transpose_split_quant_ref(x, xscale, tokens_per_expert, pow_2_scales): shape = x.shape + if x.dtype == paddle.float8_e4m3fn: + x = dequant_ref(x, xscale) x = x.reshape([shape[0] // 128, 128, shape[1]]) amax = x.astype('float32').abs().max(axis=1) @@ -37,43 +49,76 @@ def fused_transpose_split_quant_ref(x, tokens_per_expert, pow_2_scales): return out, scale -def test_fused_transpose_split_quant(tokens_per_expert, seq_len, pow_2_scales): +def test_fused_transpose_split_quant( + tokens_per_expert, seq_len, pow_2_scales, using_fp8=False +): x = paddle.randn([sum(tokens_per_expert), seq_len], dtype='bfloat16') - x = paddle.clip(x, min=-50, max=50) + if using_fp8: + x = x.cast('float8_e4m3fn') + xscale = ( + paddle.randn( + [sum(tokens_per_expert), (seq_len + 127) // 128], dtype='float32' + ) + if using_fp8 + else None + ) + # x = paddle.clip(x, min=-50, max=50) out, scale = paddle.incubate.nn.functional.fused_transpose_split_quant( - x, tokens_per_expert, pow_2_scales + x, xscale, tokens_per_expert, pow_2_scales ) out_ref, scale_ref = fused_transpose_split_quant_ref( - x, tokens_per_expert, pow_2_scales + x, xscale, tokens_per_expert, pow_2_scales ) for t, t_ref in zip(out, out_ref): - np.testing.assert_allclose(t.astype('float32'), t_ref.astype('float32')) + try: + np.testing.assert_allclose( + t.astype('float32'), t_ref.astype('float32') + ) + except AssertionError as e: + print("AssertionError", e) for t, t_ref in zip(scale, scale_ref): - np.testing.assert_allclose(t, t_ref) + try: + np.testing.assert_allclose(t, t_ref) + except AssertionError as e: + print("AssertionError", e) def run(): - test_fused_transpose_split_quant([0, 0], 1024, False) - test_fused_transpose_split_quant([128, 2 * 128], 0, True) - test_fused_transpose_split_quant([128], 1, False) - test_fused_transpose_split_quant([0, 128, 0, 2 * 128], 127, True) - test_fused_transpose_split_quant([3 * 128, 4 * 128, 5 * 128], 233, False) - test_fused_transpose_split_quant( - [24 * 128, 128, 50 * 128, 16 * 128], 2162, True - ) - test_fused_transpose_split_quant( - [7 * 128, 29 * 128, 3 * 128, 128 * 128, 13 * 128], 4000, False - ) - test_fused_transpose_split_quant( - [18 * 128, 5 * 128, 24 * 128, 128, 6 * 128, 0, 27 * 128, 7 * 128], - 7168, - True, - ) + fp8_choice = [True, False] + for using_fp8 in fp8_choice: + test_fused_transpose_split_quant( + [0, 0], 1024, False, using_fp8=using_fp8 + ) + test_fused_transpose_split_quant( + [128, 2 * 128], 0, True, using_fp8=using_fp8 + ) + test_fused_transpose_split_quant([128], 1, False, using_fp8=using_fp8) + test_fused_transpose_split_quant( + [0, 128, 0, 2 * 128], 127, True, using_fp8=using_fp8 + ) + test_fused_transpose_split_quant( + [3 * 128, 4 * 128, 5 * 128], 233, False, using_fp8=using_fp8 + ) + test_fused_transpose_split_quant( + [24 * 128, 128, 50 * 128, 16 * 128], 2162, True, using_fp8=using_fp8 + ) + test_fused_transpose_split_quant( + [7 * 128, 29 * 128, 3 * 128, 128 * 128, 13 * 128], + 4000, + False, + using_fp8=using_fp8, + ) + test_fused_transpose_split_quant( + [18 * 128, 5 * 128, 24 * 128, 128, 6 * 128, 0, 27 * 128, 7 * 128], + 7168, + True, + using_fp8=using_fp8, + ) if __name__ == '__main__': diff --git a/test/legacy_test/test_fusion_gru_op.py b/test/legacy_test/test_fusion_gru_op.py index 
950142835e6524..80f2bd185876b5 100644 --- a/test/legacy_test/test_fusion_gru_op.py +++ b/test/legacy_test/test_fusion_gru_op.py @@ -111,7 +111,7 @@ def setUp(self): 'gate_activation': self.act_gate, 'is_reverse': self.is_reverse, 'origin_mode': self.origin_mode, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } def test_check_output(self): diff --git a/test/legacy_test/test_gaussian_random_op.py b/test/legacy_test/test_gaussian_random_op.py index c4f860bcc7e973..36b8453b097865 100644 --- a/test/legacy_test/test_gaussian_random_op.py +++ b/test/legacy_test/test_gaussian_random_op.py @@ -40,7 +40,7 @@ def setUp(self): "mean": self.mean, "std": self.std, "seed": 10, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, } paddle.seed(10) @@ -82,7 +82,7 @@ def setUp(self): "std": self.std, "seed": 10, "dtype": paddle.float16, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, } paddle.seed(10) @@ -134,7 +134,7 @@ def setUp(self): "std": self.std, "seed": 10, "dtype": paddle.bfloat16, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, } paddle.seed(10) @@ -184,7 +184,7 @@ def setUp(self): 'mean': self.mean, 'std': self.std, 'seed': self.seed, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.inputs = {"ShapeTensorList": shape_tensor_list} @@ -251,7 +251,7 @@ def setUp(self): 'mean': self.mean, 'std': self.std, 'seed': self.seed, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = {'Out': np.zeros((123, 92), dtype='float32')} diff --git a/test/legacy_test/test_kron_op.py b/test/legacy_test/test_kron_op.py index 05ff4b6dd777a4..7f634707a352f9 100644 --- a/test/legacy_test/test_kron_op.py +++ b/test/legacy_test/test_kron_op.py @@ -272,7 +272,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): diff --git a/test/legacy_test/test_matmul_v2_op.py b/test/legacy_test/test_matmul_v2_op.py index 5e1938dc704141..16bce228f637b5 100644 --- a/test/legacy_test/test_matmul_v2_op.py +++ b/test/legacy_test/test_matmul_v2_op.py @@ -713,7 +713,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -766,7 +766,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -828,7 +828,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -854,7 +854,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -880,7 +880,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = 
{'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -906,7 +906,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_base_dtype(self): @@ -950,7 +950,7 @@ def setUp(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.out = np.matmul(self.x, self.y) - self.attrs = {'axis': -1, 'use_mkldnn': False} + self.attrs = {'axis': -1, 'use_onednn': False} self.outputs = {'Out': self.out} def init_input_output(self): diff --git a/test/legacy_test/test_ones_op.py b/test/legacy_test/test_ones_op.py index 3394bc611e7bfe..63ea2930633414 100644 --- a/test/legacy_test/test_ones_op.py +++ b/test/legacy_test/test_ones_op.py @@ -20,38 +20,121 @@ class ApiOnesTest(unittest.TestCase): - def test_paddle_ones(self): + def test_static_ones(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones(10, dtype=paddle.float32) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10], dtype="float32") + np.testing.assert_equal(result, expect) + + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones(10, 2, 3, dtype=paddle.float32) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones([10, 2, 3], dtype=paddle.float32) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones(size=[10, 2, 3], dtype=paddle.float32) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones([10, 2, 3], paddle.float32) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones([10, 2, 3], "float32") + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + + with paddle.static.program_guard(paddle.static.Program()): + ones = paddle.ones(shape=[10, 2, 3], dtype=paddle.float32) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + (result,) = exe.run(fetch_list=[ones]) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + with paddle.static.program_guard(paddle.static.Program()): ones = paddle.ones(shape=[10]) place = paddle.CPUPlace() exe = paddle.static.Executor(place) (result,) = exe.run(fetch_list=[ones]) - expected_result = np.ones(10, dtype="float32") - self.assertEqual((result == expected_result).all(), True) + expect =
np.ones(10, dtype="float32") + np.testing.assert_equal(result, expect) with paddle.static.program_guard(paddle.static.Program()): ones = paddle.ones(shape=[10], dtype="float64") place = paddle.CPUPlace() exe = paddle.static.Executor(place) (result,) = exe.run(fetch_list=[ones]) - expected_result = np.ones(10, dtype="float64") - self.assertEqual((result == expected_result).all(), True) + expect = np.ones(10, dtype="float64") + np.testing.assert_equal(result, expect) with paddle.static.program_guard(paddle.static.Program()): ones = paddle.ones(shape=[10], dtype="int64") place = paddle.CPUPlace() exe = paddle.static.Executor(place) (result,) = exe.run(fetch_list=[ones]) - expected_result = np.ones(10, dtype="int64") - self.assertEqual((result == expected_result).all(), True) + expect = np.ones(10, dtype="int64") + np.testing.assert_equal(result, expect) with paddle.static.program_guard(paddle.static.Program()): ones = paddle.ones(shape=10, dtype="int64") place = paddle.CPUPlace() exe = paddle.static.Executor(place) (result,) = exe.run(fetch_list=[ones]) - expected_result = np.ones(10, dtype="int64") - self.assertEqual((result == expected_result).all(), True) + expect = np.ones(10, dtype="int64") + np.testing.assert_equal(result, expect) + paddle.disable_static() + + def test_dygraph_ones(self): + paddle.disable_static() + result = paddle.ones(10, dtype=paddle.float32) + expect = np.ones([10], dtype="float32") + np.testing.assert_equal(result, expect) + + result = paddle.ones(10, 2, 3, dtype=paddle.float32) + expect = np.ones([10, 2, 3], dtype="float32") + np.testing.assert_equal(result, expect) + + result = paddle.ones([10, 2, 3], dtype=paddle.float32) + np.testing.assert_equal(result, expect) + + result = paddle.ones(size=[10, 2, 3], dtype=paddle.float32) + np.testing.assert_equal(result, expect) + + result = paddle.ones([10, 2, 3], paddle.float32) + np.testing.assert_equal(result, expect) + + result = paddle.ones([10, 2, 3], "float32") + np.testing.assert_equal(result, expect) + + result = paddle.ones(shape=[10, 2, 3], dtype=paddle.float32) + np.testing.assert_equal(result, expect) if __name__ == "__main__": diff --git a/test/legacy_test/test_pool2d_op.py b/test/legacy_test/test_pool2d_op.py index 3c38b4a1ec9381..b2eea65d3caef0 100644 --- a/test/legacy_test/test_pool2d_op.py +++ b/test/legacy_test/test_pool2d_op.py @@ -451,7 +451,7 @@ def setUp(self): 'pooling_type': self.pool_type, 'global_pooling': self.global_pool, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'ceil_mode': self.ceil_mode, 'data_format': self.data_format, 'exclusive': self.exclusive, diff --git a/test/legacy_test/test_reshape_op.py b/test/legacy_test/test_reshape_op.py index 06ff5633ba4b07..011ae2a55606d5 100755 --- a/test/legacy_test/test_reshape_op.py +++ b/test/legacy_test/test_reshape_op.py @@ -406,7 +406,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(input)} self.attrs = { 'shape': self.new_shape, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { "Out": self.inputs["X"].reshape(self.inferred_shape), diff --git a/test/legacy_test/test_sgd_op_bf16.py b/test/legacy_test/test_sgd_op_bf16.py index 25bacbbecf0aff..4cefc0c97df638 100644 --- a/test/legacy_test/test_sgd_op_bf16.py +++ b/test/legacy_test/test_sgd_op_bf16.py @@ -49,7 +49,7 @@ def setUp(self): self.inputs = {'Param': w_bf16, 'Grad': g_bf16, 'LearningRate': lr_bf16} self.outputs = {'ParamOut': w - lr * g} - self.attrs = {'use_mkldnn': self.use_onednn} + 
self.attrs = {'use_onednn': self.use_onednn} def conf(self): self.h = 102 @@ -157,7 +157,7 @@ def test_sparse_grad_sgd(self): Grad='Grad', ParamOut='Param', LearningRate='LearningRate', - use_mkldnn=True, + use_onednn=True, ) sgd_op.run(scope, place) @@ -215,7 +215,7 @@ def test_sparse_param_grad_sgd(self): Grad='Grad', ParamOut='Param', LearningRate='LearningRate', - use_mkldnn=True, + use_onednn=True, ) sgd_op.run(scope, place) diff --git a/test/legacy_test/test_slice_op.py b/test/legacy_test/test_slice_op.py index 88e8b802c5a704..a75b4192ac986a 100644 --- a/test/legacy_test/test_slice_op.py +++ b/test/legacy_test/test_slice_op.py @@ -160,7 +160,7 @@ def setUp(self): 'starts': self.starts, 'ends': self.ends, 'infer_flags': self.infer_flags, - 'use_mkldnn': True, + 'use_onednn': True, } def config(self): diff --git a/test/legacy_test/test_softmax_op.py b/test/legacy_test/test_softmax_op.py index d44102567b6c84..1b9ce32daac00c 100644 --- a/test/legacy_test/test_softmax_op.py +++ b/test/legacy_test/test_softmax_op.py @@ -78,7 +78,7 @@ def setUp(self): self.attrs = { 'axis': self.axis, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.enable_cinn = True @@ -161,7 +161,7 @@ def setUp(self): self.attrs = { 'axis': -1, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.enable_cinn = False @@ -210,7 +210,7 @@ def setUp(self): self.attrs = { 'axis': -1, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.enable_cinn = False @@ -487,7 +487,7 @@ def setUp(self): self.attrs = { 'axis': self.axis, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } def init_cudnn(self): diff --git a/test/legacy_test/test_sum_op.py b/test/legacy_test/test_sum_op.py index 0746cc46d022a9..f310d4400e2847 100644 --- a/test/legacy_test/test_sum_op.py +++ b/test/legacy_test/test_sum_op.py @@ -37,7 +37,7 @@ from paddle.framework import in_pir_mode -def sum_wrapper(X, use_mkldnn=False): +def sum_wrapper(X, use_onednn=False): res = paddle.full(shape=X[0].shape, fill_value=0.0, dtype=X[0].dtype) for x in X: res = paddle.add(res, x) @@ -59,7 +59,7 @@ def setUp(self): self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} y = x0 + x1 + x2 self.outputs = {'Out': y} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} def init_kernel_type(self): self.dtype = np.float64 diff --git a/test/legacy_test/test_transpose_op.py b/test/legacy_test/test_transpose_op.py index 69f2f55dd063a7..c229b0578a8724 100644 --- a/test/legacy_test/test_transpose_op.py +++ b/test/legacy_test/test_transpose_op.py @@ -35,7 +35,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype("float64")} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': np.random.random(self.shape).astype("float64"), @@ -146,7 +146,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype("float64")} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': np.random.random(self.shape).astype("float64"), @@ -169,7 +169,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype("float64")} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': 
np.random.random(self.shape).astype("float64"), @@ -191,7 +191,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype("float64")} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': np.random.random(self.shape).astype("float64"), @@ -234,7 +234,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': np.random.random(self.shape).astype(self.dtype), @@ -279,7 +279,7 @@ def setUp(self): self.inputs = {'X': convert_float_to_uint16(x)} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': convert_float_to_uint16( @@ -330,7 +330,7 @@ def setUp(self): self.inputs = {'X': x} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': np.random.random(self.shape).astype(self.dtype), @@ -376,7 +376,7 @@ def setUp(self): self.inputs = {'X': convert_float_to_uint16(x)} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': convert_float_to_uint16( diff --git a/test/mkldnn/onednn_op_test.py b/test/mkldnn/onednn_op_test.py index 7eabd3b4d9c0ff..171000f910ded9 100644 --- a/test/mkldnn/onednn_op_test.py +++ b/test/mkldnn/onednn_op_test.py @@ -48,7 +48,7 @@ def check_if_onednn_primitives_exist_in_bwd( 'X': block.var('x'), }, outputs={'Out': block.var('out')}, - attrs={'use_mkldnn': True}, + attrs={'use_onednn': True}, ) # Generate backward op_desc @@ -122,7 +122,7 @@ def check_if_onednn_batchnorm_primitives_exist_in_bwd( "epsilon": test_case.epsilon, "is_test": False, "data_layout": data_layout, - "use_mkldnn": test_case.use_mkldnn, + "use_onednn": test_case.use_onednn, "fuse_with_relu": test_case.fuse_with_relu, "use_global_stats": test_case.use_global_stats, }, diff --git a/test/mkldnn/test_activation_bf16_mkldnn_op.py b/test/mkldnn/test_activation_bf16_mkldnn_op.py index e5ac9d71a044a3..d9685692eb9a72 100644 --- a/test/mkldnn/test_activation_bf16_mkldnn_op.py +++ b/test/mkldnn/test_activation_bf16_mkldnn_op.py @@ -39,7 +39,7 @@ def op_grad(self, dout, x): pass def set_attrs(self): - self.attrs = {"use_mkldnn": True} + self.attrs = {"use_onednn": True} def init_data(self): self.x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype(np.float32) @@ -147,7 +147,7 @@ def op_grad(self, dout, x): ) def set_attrs(self): - self.attrs = {"use_mkldnn": True, "approximate": True} + self.attrs = {"use_onednn": True, "approximate": True} class TestONEDNNGeluTanhDim2BF16Op(TestONEDNNGeluTanhBF16Op): @@ -211,7 +211,7 @@ def op_grad(self, dout, x): def set_attrs(self): self.alpha = 0.2 - self.attrs = {"use_mkldnn": True, "alpha": self.alpha} + self.attrs = {"use_onednn": True, "alpha": self.alpha} class TestONEDNNSwishBF16Op(ONEDNNBF16ActivationOp, TestActivation): @@ -230,7 +230,7 @@ def op_grad(self, dout, x): def set_attrs(self): self.beta = 0.2 - self.attrs = {"use_mkldnn": True, "beta": self.beta} + self.attrs = {"use_onednn": True, "beta": self.beta} class TestONEDNNHardSwishBF16Op(ONEDNNBF16ActivationOp, TestActivation): @@ -284,7 +284,7 @@ def op_grad(self, dout, x): def set_attrs(self): self.alpha = 0.2 - self.attrs = {"use_mkldnn": True, "alpha": self.alpha} + self.attrs = {"use_onednn": True, "alpha": 
self.alpha} class TestONEDNNExpBF16Op(ONEDNNBF16ActivationOp, TestActivation): diff --git a/test/mkldnn/test_bilinear_interp_v2_mkldnn_op.py b/test/mkldnn/test_bilinear_interp_v2_mkldnn_op.py index eca6ef8b9c7b0e..84970be1aaf057 100644 --- a/test/mkldnn/test_bilinear_interp_v2_mkldnn_op.py +++ b/test/mkldnn/test_bilinear_interp_v2_mkldnn_op.py @@ -145,7 +145,7 @@ def setUp(self): 'out_w': self.out_w, 'scale': self.scale, 'data_layout': self.data_layout, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = {'Out': output_np} diff --git a/test/mkldnn/test_cast_mkldnn_op.py b/test/mkldnn/test_cast_mkldnn_op.py index db12d0b21101bf..02aa59396208d7 100644 --- a/test/mkldnn/test_cast_mkldnn_op.py +++ b/test/mkldnn/test_cast_mkldnn_op.py @@ -42,7 +42,7 @@ def setUp(self): self.attrs = { 'in_dtype': prepare_dtype(self.x), 'out_dtype': prepare_dtype(self.out), - 'use_mkldnn': True, + 'use_onednn': True, } self.op_type = 'cast' diff --git a/test/mkldnn/test_concat_bf16_mkldnn_op.py b/test/mkldnn/test_concat_bf16_mkldnn_op.py index 606deb6976d4ac..0faf7e16482fb5 100644 --- a/test/mkldnn/test_concat_bf16_mkldnn_op.py +++ b/test/mkldnn/test_concat_bf16_mkldnn_op.py @@ -35,7 +35,7 @@ def setUp(self): self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} self.attrs = { 'axis': self.axis, - 'use_mkldnn': True, + 'use_onednn': True, 'mkldnn_data_type': self.onednn_data_type, } diff --git a/test/mkldnn/test_concat_int8_mkldnn_op.py b/test/mkldnn/test_concat_int8_mkldnn_op.py index 89d2b71c688807..7f25b41c4191ea 100644 --- a/test/mkldnn/test_concat_int8_mkldnn_op.py +++ b/test/mkldnn/test_concat_int8_mkldnn_op.py @@ -27,7 +27,7 @@ def setUp(self): self.init_shape() self.init_test_data() self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} - self.attrs = {'axis': self.axis, 'use_mkldnn': True} + self.attrs = {'axis': self.axis, 'use_onednn': True} self.output = np.concatenate( (self.x0, self.x1, self.x2), axis=self.axis diff --git a/test/mkldnn/test_conv2d_bf16_mkldnn_op.py b/test/mkldnn/test_conv2d_bf16_mkldnn_op.py index da802ed21ba979..562595733933df 100644 --- a/test/mkldnn/test_conv2d_bf16_mkldnn_op.py +++ b/test/mkldnn/test_conv2d_bf16_mkldnn_op.py @@ -110,7 +110,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, 'force_fp32_output': self.force_fp32_output, 'fuse_residual_connection': self.fuse_residual, diff --git a/test/mkldnn/test_conv2d_int8_mkldnn_op.py b/test/mkldnn/test_conv2d_int8_mkldnn_op.py index d2e6d33607e4fe..23b3e938349b2f 100644 --- a/test/mkldnn/test_conv2d_int8_mkldnn_op.py +++ b/test/mkldnn/test_conv2d_int8_mkldnn_op.py @@ -166,7 +166,7 @@ def residual_helper(init_low, init_high, output_): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'exhaustive_search': self.exhaustive_search, 'Scale_in': self.scale_in, diff --git a/test/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py b/test/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py index eaa12b49ee993f..5273b8c232a5b8 100644 --- a/test/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py +++ b/test/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py @@ -90,7 +90,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'is_test': self.is_test, - 'use_mkldnn': self.use_onednn, + 
'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, 'force_fp32_output': self.force_fp32_output, 'data_format': self.data_format, diff --git a/test/mkldnn/test_elementwise_add_bf16_mkldnn_op.py b/test/mkldnn/test_elementwise_add_bf16_mkldnn_op.py index b51d7e989c371a..c552d1215267c6 100644 --- a/test/mkldnn/test_elementwise_add_bf16_mkldnn_op.py +++ b/test/mkldnn/test_elementwise_add_bf16_mkldnn_op.py @@ -36,7 +36,7 @@ def setUp(self): self.y_bf16 = convert_float_to_uint16(self.y) self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': convert_float_to_uint16(self.out)} def generate_data(self): diff --git a/test/mkldnn/test_elementwise_div_mkldnn_op.py b/test/mkldnn/test_elementwise_div_mkldnn_op.py index 367c2b2b210e7b..f081f00e398a0e 100644 --- a/test/mkldnn/test_elementwise_div_mkldnn_op.py +++ b/test/mkldnn/test_elementwise_div_mkldnn_op.py @@ -37,7 +37,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def init_input_output(self): @@ -164,7 +164,7 @@ def setUp(self): self.x_bf16 = convert_float_to_uint16(self.x) self.y_bf16 = convert_float_to_uint16(self.y) self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': convert_float_to_uint16(self.out)} def init_dtype(self): diff --git a/test/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py b/test/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py index 8500c7dea868ba..b138c87f0cd477 100644 --- a/test/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py +++ b/test/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py @@ -35,7 +35,7 @@ def setUp(self): self.x_bf16 = convert_float_to_uint16(self.x) self.y_bf16 = convert_float_to_uint16(self.y) self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': convert_float_to_uint16(self.out)} def generate_data(self): diff --git a/test/mkldnn/test_elementwise_sub_onednn_op.py b/test/mkldnn/test_elementwise_sub_onednn_op.py index a9787c115109eb..51e30dd4d6bca4 100644 --- a/test/mkldnn/test_elementwise_sub_onednn_op.py +++ b/test/mkldnn/test_elementwise_sub_onednn_op.py @@ -44,7 +44,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def init_input_output(self): @@ -225,7 +225,7 @@ def setUp(self): self.x_bf16 = convert_float_to_uint16(self.x) self.y_bf16 = convert_float_to_uint16(self.y) self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': convert_float_to_uint16(self.out)} def init_dtype(self): diff --git a/test/mkldnn/test_expand_v2_mkldnn_op.py b/test/mkldnn/test_expand_v2_mkldnn_op.py index 8d30412e510dd0..3036069b50b010 100644 --- 
a/test/mkldnn/test_expand_v2_mkldnn_op.py +++ b/test/mkldnn/test_expand_v2_mkldnn_op.py @@ -30,7 +30,7 @@ def setUp(self): self.op_type = "expand_v2" self.init_data() self.x = np.random.random(self.ori_shape).astype("float32") - self.attrs = {'shape': self.shape, 'use_mkldnn': True} + self.attrs = {'shape': self.shape, 'use_onednn': True} self.set_inputs() self.set_additional_inputs() output = np.tile(self.x, self.expand_times) diff --git a/test/mkldnn/test_fc_bf16_mkldnn_op.py b/test/mkldnn/test_fc_bf16_mkldnn_op.py index 05c4d6775283fd..b04120c1e7e5a6 100644 --- a/test/mkldnn/test_fc_bf16_mkldnn_op.py +++ b/test/mkldnn/test_fc_bf16_mkldnn_op.py @@ -60,7 +60,7 @@ def setUp(self): } self.attrs = { - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'force_fp32_output': self.force_fp32_output, } diff --git a/test/mkldnn/test_fc_int8_mkldnn_op.py b/test/mkldnn/test_fc_int8_mkldnn_op.py index da14db39df48da..353978b12b23d4 100644 --- a/test/mkldnn/test_fc_int8_mkldnn_op.py +++ b/test/mkldnn/test_fc_int8_mkldnn_op.py @@ -33,7 +33,7 @@ def setUp(self): ) self.attrs = { - 'use_mkldnn': True, + 'use_onednn': True, 'Scale_in': self.x_scale, 'Scale_weights': [self.y_scale] * y_scales_size, 'Scale_out': self.out_scale, diff --git a/test/mkldnn/test_fc_mkldnn_op.py b/test/mkldnn/test_fc_mkldnn_op.py index 3372238db9d9d4..b625cb57db35b1 100644 --- a/test/mkldnn/test_fc_mkldnn_op.py +++ b/test/mkldnn/test_fc_mkldnn_op.py @@ -45,7 +45,7 @@ def setUp(self): 'Bias': self.bias, } - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} self.outputs = { 'Out': fully_connected_naive( diff --git a/test/mkldnn/test_fill_constant_mkldnn_op.py b/test/mkldnn/test_fill_constant_mkldnn_op.py index 562a0dd0ae503d..01d1feb83d06ea 100644 --- a/test/mkldnn/test_fill_constant_mkldnn_op.py +++ b/test/mkldnn/test_fill_constant_mkldnn_op.py @@ -57,7 +57,7 @@ def set_inputs(self): self.inputs = {} def set_attrs(self): - self.attrs = {'shape': (3, 5), 'use_mkldnn': True, 'value': self.value} + self.attrs = {'shape': (3, 5), 'use_onednn': True, 'value': self.value} def test_check_output(self): self.check_output(check_pir_onednn=True) @@ -87,7 +87,7 @@ def set_inputs(self): class TestFillZerosLike2DStringValueInfOneDNNOp(TestFillConstant2DOneDNNOp): def set_attrs(self): self.str_value = "inf" - self.attrs = {'shape': (10, 13), 'use_mkldnn': True, 'str_value': "inf"} + self.attrs = {'shape': (10, 13), 'use_onednn': True, 'str_value': "inf"} class TestFillZerosLike2DStringValueMinusInfOneDNNOp( @@ -97,7 +97,7 @@ def set_attrs(self): self.str_value = "-inf" self.attrs = { 'shape': (10, 13), - 'use_mkldnn': True, + 'use_onednn': True, 'str_value': "-inf", } @@ -107,7 +107,7 @@ def set_attrs(self): self.str_value = "0.123" self.attrs = { 'shape': (10, 13), - 'use_mkldnn': True, + 'use_onednn': True, 'str_value': "0.123", } diff --git a/test/mkldnn/test_flags_use_mkldnn.py b/test/mkldnn/test_flags_use_mkldnn.py index 54b2be715809c9..01d483f9f9e2fe 100644 --- a/test/mkldnn/test_flags_use_mkldnn.py +++ b/test/mkldnn/test_flags_use_mkldnn.py @@ -22,7 +22,7 @@ class TestFlagsUseOnednn(unittest.TestCase): def setUp(self): self._python_interp = sys.executable - self._python_interp += " check_flags_use_mkldnn.py" + self._python_interp += " check_flags_use_onednn.py" self.env = os.environ.copy() self.env["GLOG_v"] = "1" diff --git a/test/mkldnn/test_flatten_mkldnn_op.py b/test/mkldnn/test_flatten_mkldnn_op.py index 7bd90724082a17..2ba826e3ddc9ed 100644 --- 
a/test/mkldnn/test_flatten_mkldnn_op.py +++ b/test/mkldnn/test_flatten_mkldnn_op.py @@ -27,7 +27,7 @@ def setUp(self): self.set_op_type() self.init_test_case() self.set_inputs() - self.attrs = {"axis": self.axis, 'use_mkldnn': True} + self.attrs = {"axis": self.axis, 'use_onednn': True} self.ori_shape = self.inputs['X'].shape self.outputs = {"Out": self.inputs["X"].copy().reshape(self.new_shape)} diff --git a/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py b/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py index e51b67888f402a..6248a7fe7e102e 100644 --- a/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py +++ b/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py @@ -129,7 +129,7 @@ def setUp(self): 'is_reverse': self.is_reverse, 'origin_mode': self.origin_mode, 'force_fp32_output': self.force_fp32_output, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, } diff --git a/test/mkldnn/test_fusion_gru_int8_mkldnn_op.py b/test/mkldnn/test_fusion_gru_int8_mkldnn_op.py index 043a5eaa074030..e88fce1507f884 100644 --- a/test/mkldnn/test_fusion_gru_int8_mkldnn_op.py +++ b/test/mkldnn/test_fusion_gru_int8_mkldnn_op.py @@ -141,7 +141,7 @@ def setUp(self): 'gate_activation': self.act_gate, 'is_reverse': self.is_reverse, 'origin_mode': self.origin_mode, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, 'force_fp32_output': self.force_fp32_output, 'Scale_data': scale_data, diff --git a/test/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py b/test/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py index f87b674f59c6ae..bff4586e3d0c0e 100644 --- a/test/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py +++ b/test/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py @@ -145,7 +145,7 @@ def setUp(self): 'cell_activation': self.act_cell, 'candidate_activation': self.act_cand, 'force_fp32_output': self.force_fp32_output, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, } diff --git a/test/mkldnn/test_fusion_lstm_int8_mkldnn_op.py b/test/mkldnn/test_fusion_lstm_int8_mkldnn_op.py index 198bc2685cec49..c27e7b226fd283 100644 --- a/test/mkldnn/test_fusion_lstm_int8_mkldnn_op.py +++ b/test/mkldnn/test_fusion_lstm_int8_mkldnn_op.py @@ -130,7 +130,7 @@ def setUp(self): 'candidate_activation': self.act_cand, 'is_reverse': self.is_reverse, 'use_peepholes': self.use_peepholes, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, 'force_fp32_output': self.force_fp32_output, 'Scale_data': scale_data, diff --git a/test/mkldnn/test_gaussian_random_mkldnn_op.py b/test/mkldnn/test_gaussian_random_mkldnn_op.py index 84bcea864c306f..d45c678769a857 100644 --- a/test/mkldnn/test_gaussian_random_mkldnn_op.py +++ b/test/mkldnn/test_gaussian_random_mkldnn_op.py @@ -40,7 +40,7 @@ def setUp(self): "mean": 1.0, "std": 2.0, "seed": 10, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, } @@ -57,7 +57,7 @@ def setUp(self): "mean": self.mean, "std": self.std, "seed": 10, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, } paddle.seed(10) diff --git a/test/mkldnn/test_log_softmax_mkldnn_op.py b/test/mkldnn/test_log_softmax_mkldnn_op.py index 9f4807acb3fbc2..6d838bc86ff9c1 100644 --- a/test/mkldnn/test_log_softmax_mkldnn_op.py +++ b/test/mkldnn/test_log_softmax_mkldnn_op.py @@ -44,7 +44,7 @@ def setUp(self): self.inputs = {'X': x} self.outputs = {'Out': out} - self.attrs = {'axis': self.axis, 'use_mkldnn': True} + self.attrs = 
{'axis': self.axis, 'use_onednn': True} def set_dtype(self): self.dtype = np.float32 diff --git a/test/mkldnn/test_lrn_mkldnn_op.py b/test/mkldnn/test_lrn_mkldnn_op.py index 046bad391ee09b..874c73628d77a1 100644 --- a/test/mkldnn/test_lrn_mkldnn_op.py +++ b/test/mkldnn/test_lrn_mkldnn_op.py @@ -22,7 +22,7 @@ class TestLRNONEDNNOp(TestLRNOp): def get_attrs(self): attrs = TestLRNOp.get_attrs(self) - attrs['use_mkldnn'] = True + attrs['use_onednn'] = True return attrs def test_check_output(self): diff --git a/test/mkldnn/test_matmul_bf16_mkldnn_op.py b/test/mkldnn/test_matmul_bf16_mkldnn_op.py index 8f9e932620714e..78a943e73d889d 100644 --- a/test/mkldnn/test_matmul_bf16_mkldnn_op.py +++ b/test/mkldnn/test_matmul_bf16_mkldnn_op.py @@ -33,7 +33,7 @@ def generate_data(self): def set_attributes(self): self.attrs = { 'alpha': self.alpha, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "mkldnn_data_type": self.onednn_data_type, "force_fp32_output": self.force_fp32_output, 'transpose_X': False, @@ -146,7 +146,7 @@ def generate_data(self): def set_attributes(self): self.attrs = { - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "mkldnn_data_type": self.onednn_data_type, 'transpose_X': True, 'transpose_Y': False, @@ -161,7 +161,7 @@ def generate_data(self): def set_attributes(self): self.attrs = { - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "mkldnn_data_type": self.onednn_data_type, 'transpose_Y': True, 'transpose_X': False, diff --git a/test/mkldnn/test_matmul_v2_mkldnn_op.py b/test/mkldnn/test_matmul_v2_mkldnn_op.py index 836fa86c6d43d6..4c132ebef63bb1 100644 --- a/test/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/test/mkldnn/test_matmul_v2_mkldnn_op.py @@ -77,7 +77,7 @@ def setUp(self): self.attrs = { 'trans_x': self.trans_x, 'trans_y': self.trans_y, - 'use_mkldnn': True, + 'use_onednn': True, } self.set_dtype_attr() self.outputs = {'Out': result} diff --git a/test/mkldnn/test_mul_int8_mkldnn_op.py b/test/mkldnn/test_mul_int8_mkldnn_op.py index 71db940a027e0c..802a2e9d4aae73 100644 --- a/test/mkldnn/test_mul_int8_mkldnn_op.py +++ b/test/mkldnn/test_mul_int8_mkldnn_op.py @@ -35,7 +35,7 @@ def setUp(self): self.init_data_type() self.init_data() self.attrs = { - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "scale_x": self.scale_x, "scale_y": self.scale_y, "scale_out": self.scale_out, @@ -106,7 +106,7 @@ def setUp(self): self.init_data_type() self.init_data() self.attrs = { - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "scale_x": self.scale_x, "scale_y": self.scale_y, "scale_out": self.scale_out, diff --git a/test/mkldnn/test_mul_mkldnn_op.py b/test/mkldnn/test_mul_mkldnn_op.py index 9759a581dbb4cf..d528631246b779 100644 --- a/test/mkldnn/test_mul_mkldnn_op.py +++ b/test/mkldnn/test_mul_mkldnn_op.py @@ -25,7 +25,7 @@ class TestMulOneDNNOp(OpTest): def setUp(self): self.op_type = "mul" - self.attrs = {'use_mkldnn': True} + self.attrs = {'use_onednn': True} self.init_shapes_and_attrs() self.x_fp32 = np.random.random(self.x_shape).astype(np.float32) diff --git a/test/mkldnn/test_multi_gru_mkldnn_op.py b/test/mkldnn/test_multi_gru_mkldnn_op.py index f4d2b9cb9e60d9..ea6fc57bc94ae2 100644 --- a/test/mkldnn/test_multi_gru_mkldnn_op.py +++ b/test/mkldnn/test_multi_gru_mkldnn_op.py @@ -194,7 +194,7 @@ def setUp(self): 'gate_activation': 'sigmoid', 'layers': self.layers, 'origin_mode': self.origin_mode, - 'use_mkldnn': True, + 'use_onednn': True, } if is_int8: diff --git a/test/mkldnn/test_nearest_interp_v2_mkldnn_op.py 
b/test/mkldnn/test_nearest_interp_v2_mkldnn_op.py index e1ae1bcf3b7c6b..caf65abd9cc4ea 100644 --- a/test/mkldnn/test_nearest_interp_v2_mkldnn_op.py +++ b/test/mkldnn/test_nearest_interp_v2_mkldnn_op.py @@ -146,7 +146,7 @@ def setUp(self): 'out_w': self.out_w, 'scale': self.scale, 'data_layout': self.data_layout, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = {'Out': output_np} diff --git a/test/mkldnn/test_reduce_bf16_mkldnn_op.py b/test/mkldnn/test_reduce_bf16_mkldnn_op.py index 91606f6bf6329e..b8f0e497bbdaad 100644 --- a/test/mkldnn/test_reduce_bf16_mkldnn_op.py +++ b/test/mkldnn/test_reduce_bf16_mkldnn_op.py @@ -37,7 +37,7 @@ def setUp(self): self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} self.outputs = {'Out': self.x_fp32.sum(axis=0)} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} def test_check_output(self): self.check_output( @@ -100,7 +100,7 @@ def setUp(self): self.x_fp32 = np.random.normal(size=(2, 3, 5, 6)).astype('float32') self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': [0, 1, 2, 3]} + self.attrs = {'use_onednn': self.use_onednn, 'dim': [0, 1, 2, 3]} self.outputs = {'Out': self.x_fp32.sum(axis=tuple(self.attrs['dim']))} @@ -113,7 +113,7 @@ def setUp(self): self.x_fp32 = np.random.normal(size=(4, 7, 6, 6)).astype('float32') self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': [-1, -2, -3, -4]} + self.attrs = {'use_onednn': self.use_onednn, 'dim': [-1, -2, -3, -4]} self.outputs = {'Out': self.x_fp32.sum(axis=tuple(self.attrs['dim']))} @@ -126,7 +126,7 @@ def setUp(self): self.x_fp32 = np.random.normal(size=(2, 5, 3, 2, 5)).astype('float32') self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'reduce_all': True, 'keep_dim': True, 'use_mkldnn': True} + self.attrs = {'reduce_all': True, 'keep_dim': True, 'use_onednn': True} self.outputs = {'Out': self.x_fp32.sum(keepdims=self.attrs['keep_dim'])} @@ -139,7 +139,7 @@ def setUp(self): self.x_fp32 = np.random.normal(size=(4, 5, 4, 5)).astype('float32') self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'reduce_all': True, 'use_mkldnn': self.use_onednn} + self.attrs = {'reduce_all': True, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.x_fp32.sum()} @@ -156,7 +156,7 @@ def setUp(self): self.x_fp32 = np.random.random((5, 6, 10)).astype("float32") self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'dim': [-1], 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': [-1], 'use_onednn': self.use_onednn} self.outputs = {'Out': self.x_fp32.max(axis=tuple(self.attrs['dim']))} @@ -175,7 +175,7 @@ def setUp(self): self.x_fp32 = np.random.random((5, 6, 10, 9)).astype("float32") self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'dim': [-1, 0, 1], 'use_mkldnn': self.use_onednn} + self.attrs = {'dim': [-1, 0, 1], 'use_onednn': self.use_onednn} self.outputs = {'Out': self.x_fp32.max(axis=tuple(self.attrs['dim']))} @@ -192,7 +192,7 @@ def setUp(self): self.x_fp32 = np.random.random((5, 6, 10)).astype("float32") self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'dim': [2], 'use_mkldnn': self.use_onednn} + 
self.attrs = {'dim': [2], 'use_onednn': self.use_onednn} self.outputs = {'Out': self.x_fp32.min(axis=tuple(self.attrs['dim']))} @@ -203,7 +203,7 @@ def setUp(self): self.x_fp32 = np.random.random((5, 6, 10)).astype("float32") self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} self.outputs = {'Out': self.x_fp32.sum(axis=0) / self.x_fp32.shape[0]} @@ -214,7 +214,7 @@ def setUp(self): self.x_fp32 = np.random.random((5, 6, 3, 5)).astype("float32") self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.inputs = {'X': self.x_bf16} - self.attrs = {'use_mkldnn': self.use_onednn, 'dim': [0, 1]} + self.attrs = {'use_onednn': self.use_onednn, 'dim': [0, 1]} self.outputs = { 'Out': self.x_fp32.sum(axis=tuple(self.attrs['dim'])) / (self.x_fp32.shape[0] * self.x_fp32.shape[1]) diff --git a/test/mkldnn/test_reshape_bf16_op.py b/test/mkldnn/test_reshape_bf16_op.py index 94978e67d81468..587e348644c66a 100644 --- a/test/mkldnn/test_reshape_bf16_op.py +++ b/test/mkldnn/test_reshape_bf16_op.py @@ -35,7 +35,7 @@ def setUp(self): self.inputs = {'X': self.input_data} self.attrs = { 'shape': self.new_shape, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, } self.outputs = { diff --git a/test/mkldnn/test_scale_bf16_mkldnn_op.py b/test/mkldnn/test_scale_bf16_mkldnn_op.py index 26943471b285dd..2ababf6f4441d4 100644 --- a/test/mkldnn/test_scale_bf16_mkldnn_op.py +++ b/test/mkldnn/test_scale_bf16_mkldnn_op.py @@ -35,7 +35,7 @@ def setUp(self): self.x_bf16 = convert_float_to_uint16(self.x_fp32) self.scale = -2.3 self.inputs = {'X': self.x_bf16} - self.attrs = {'scale': self.scale, 'use_mkldnn': True, 'bias': 0.4} + self.attrs = {'scale': self.scale, 'use_onednn': True, 'bias': 0.4} self.use_onednn = True self.outputs = { 'Out': (self.x_fp32 * self.attrs['scale']) + self.attrs['bias'] @@ -78,7 +78,7 @@ def setUp(self): self.inputs = {'X': self.x_bf16} self.attrs = { 'scale': self.scale, - 'use_mkldnn': True, + 'use_onednn': True, 'bias': 0.0, 'bias_after_scale': False, } @@ -99,7 +99,7 @@ def setUp(self): 'X': self.x_bf16, 'ScaleTensor': convert_float_to_uint16(self.scale_tensor), } - self.attrs = {'use_mkldnn': True} + self.attrs = {'use_onednn': True} self.outputs = {'Out': self.x_fp32 * self.scale} @@ -117,7 +117,7 @@ def setUp(self): self.attrs = { 'bias': -1.1, 'bias_after_scale': False, - 'use_mkldnn': True, + 'use_onednn': True, } self.outputs = {'Out': (self.x_fp32 + self.attrs['bias']) * self.scale} diff --git a/test/mkldnn/test_shuffle_channel_mkldnn_op.py b/test/mkldnn/test_shuffle_channel_mkldnn_op.py index e9510c96369617..36e10885a6c707 100644 --- a/test/mkldnn/test_shuffle_channel_mkldnn_op.py +++ b/test/mkldnn/test_shuffle_channel_mkldnn_op.py @@ -28,7 +28,7 @@ def setUp(self): self.set_dtype() self.set_group() self.inputs = {'X': np.random.random((5, 64, 2, 3)).astype(self.dtype)} - self.attrs = {'use_mkldnn': True, 'group': self.group} + self.attrs = {'use_onednn': True, 'group': self.group} _, c, h, w = self.inputs['X'].shape input_reshaped = np.reshape( diff --git a/test/mkldnn/test_slice_mkldnn_op.py b/test/mkldnn/test_slice_mkldnn_op.py index 1a71278a9f2167..e95b9626add571 100644 --- a/test/mkldnn/test_slice_mkldnn_op.py +++ b/test/mkldnn/test_slice_mkldnn_op.py @@ -36,7 +36,7 @@ def setUp(self): 'starts': self.starts, 'ends': self.ends, 'infer_flags': self.infer_flags, - 'use_mkldnn': True, + 'use_onednn': True, } 
self.set_attrs() diff --git a/test/mkldnn/test_softmax_bf16_mkldnn_op.py b/test/mkldnn/test_softmax_bf16_mkldnn_op.py index b52dda9aa724ce..31b16cb38e0079 100644 --- a/test/mkldnn/test_softmax_bf16_mkldnn_op.py +++ b/test/mkldnn/test_softmax_bf16_mkldnn_op.py @@ -64,7 +64,7 @@ def setUp(self): self.inputs = {'X': convert_float_to_uint16(x)} self.outputs = {'Out': out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def test_check_output(self): self.check_output_with_place(core.CPUPlace(), check_pir_onednn=True) diff --git a/test/mkldnn/test_softplus_mkldnn_op.py b/test/mkldnn/test_softplus_mkldnn_op.py index 0949b63cc2c59d..5903a9faf32193 100644 --- a/test/mkldnn/test_softplus_mkldnn_op.py +++ b/test/mkldnn/test_softplus_mkldnn_op.py @@ -37,7 +37,7 @@ def setUp(self): self.threshold = 20 self.config() self.set_dtype() - self.attrs = {'use_mkldnn': True, 'beta': self.beta} + self.attrs = {'use_onednn': True, 'beta': self.beta} self.x = np.random.random(self.x_shape) self.out = ref_softplus(self.x, self.beta, self.threshold) diff --git a/test/mkldnn/test_split_bf16_mkldnn_op.py b/test/mkldnn/test_split_bf16_mkldnn_op.py index ae8edba09fc74d..3234941a8ed553 100644 --- a/test/mkldnn/test_split_bf16_mkldnn_op.py +++ b/test/mkldnn/test_split_bf16_mkldnn_op.py @@ -45,7 +45,7 @@ def setUp(self): self.init_data() self.inputs = {'X': self.x} self.attrs = { - 'use_mkldnn': True, + 'use_onednn': True, 'num': self.num, 'mkldnn_data_type': "bfloat16", } diff --git a/test/mkldnn/test_squeeze2_mkldnn_op.py b/test/mkldnn/test_squeeze2_mkldnn_op.py index fc0f731f35b681..9e2a4bb774b99f 100644 --- a/test/mkldnn/test_squeeze2_mkldnn_op.py +++ b/test/mkldnn/test_squeeze2_mkldnn_op.py @@ -38,7 +38,7 @@ def set_inputs(self): self.inputs = {"X": self.x} def init_attrs(self): - self.attrs = {"axes": self.axes, 'use_mkldnn': True} + self.attrs = {"axes": self.axes, 'use_onednn': True} def set_outputs(self): self.outputs = { diff --git a/test/mkldnn/test_stack_mkldnn_op.py b/test/mkldnn/test_stack_mkldnn_op.py index 8b91c246d6e6b0..2bd48e74a377e1 100644 --- a/test/mkldnn/test_stack_mkldnn_op.py +++ b/test/mkldnn/test_stack_mkldnn_op.py @@ -56,7 +56,7 @@ def setUp(self): self.inputs = {'X': input_list} self.outputs = {'Y': np.stack(self.op_inputs, axis=self.axis)} - self.attrs = {'axis': self.axis, 'use_mkldnn': True} + self.attrs = {'axis': self.axis, 'use_onednn': True} def test_check_output(self): self.check_output_with_place(core.CPUPlace(), check_pir_onednn=True) diff --git a/test/mkldnn/test_sum_bf16_mkldnn_op.py b/test/mkldnn/test_sum_bf16_mkldnn_op.py index 341a17416df3e4..9bc17c6c168fa3 100644 --- a/test/mkldnn/test_sum_bf16_mkldnn_op.py +++ b/test/mkldnn/test_sum_bf16_mkldnn_op.py @@ -45,7 +45,7 @@ def setUp(self): y = x0 + x1 + x2 self.outputs = {'Out': convert_float_to_uint16(y)} - self.attrs = {'use_mkldnn': self.use_onednn} + self.attrs = {'use_onednn': self.use_onednn} def test_check_output(self): self.check_output_with_place(core.CPUPlace(), check_pir_onednn=True) diff --git a/test/mkldnn/test_transpose_bf16_mkldnn_op.py b/test/mkldnn/test_transpose_bf16_mkldnn_op.py index 8f0d5e9a952143..89c597a6d0de25 100644 --- a/test/mkldnn/test_transpose_bf16_mkldnn_op.py +++ b/test/mkldnn/test_transpose_bf16_mkldnn_op.py @@ -37,7 +37,7 @@ def setUp(self): self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'mkldnn_data_type': self.onednn_data_type, } diff --git 
a/test/mkldnn/test_transpose_int8_mkldnn_op.py b/test/mkldnn/test_transpose_int8_mkldnn_op.py index eefdc3dae12fb4..65205a9511c42f 100644 --- a/test/mkldnn/test_transpose_int8_mkldnn_op.py +++ b/test/mkldnn/test_transpose_int8_mkldnn_op.py @@ -36,7 +36,7 @@ def setUp(self): self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { diff --git a/test/mkldnn/test_transpose_mkldnn_op.py b/test/mkldnn/test_transpose_mkldnn_op.py index 125128a73b131d..f4a4bdaf173d9b 100644 --- a/test/mkldnn/test_transpose_mkldnn_op.py +++ b/test/mkldnn/test_transpose_mkldnn_op.py @@ -25,7 +25,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype("float32")} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, } self.outputs = { 'XShape': np.random.random(self.shape).astype("float32"), diff --git a/test/quantization/CMakeLists.txt b/test/quantization/CMakeLists.txt index 20082befcba268..c2f533b9b31d8c 100644 --- a/test/quantization/CMakeLists.txt +++ b/test/quantization/CMakeLists.txt @@ -6,13 +6,13 @@ file( string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") function(_inference_analysis_python_api_int8_test target model_dir data_path - filename use_mkldnn) + filename use_onednn) py_test( ${target} SRCS ${filename} ENVS CPU_NUM_THREADS=${CPU_NUM_THREADS_ON_CI} - FLAGS_use_onednn=${use_mkldnn} + FLAGS_use_onednn=${use_onednn} ARGS --infer_model ${model_dir}/model diff --git a/test/quantization/README.md b/test/quantization/README.md index eeb4b838fe7648..3137a49be0e5d3 100644 --- a/test/quantization/README.md +++ b/test/quantization/README.md @@ -264,7 +264,7 @@ The following options are also accepted: ```bash cd /PATH/TO/PADDLE -OMP_NUM_THREADS=28 FLAGS_use_mkldnn=true python python/paddle/static/quantization/slim/tests/quant2_int8_image_classification_comparison.py --quant_model=/PATH/TO/DOWNLOADED/QUANT/MODEL --fp32_model=/PATH/TO/DOWNLOADED/FP32/MODEL --infer_data=$HOME/.cache/paddle/dataset/int8/download/int8_full_val.bin --batch_size=50 --batch_num=1000 --acc_diff_threshold=0.01 --ops_to_quantize="conv2d,pool2d" +OMP_NUM_THREADS=28 FLAGS_use_onednn=true python python/paddle/static/quantization/slim/tests/quant2_int8_image_classification_comparison.py --quant_model=/PATH/TO/DOWNLOADED/QUANT/MODEL --fp32_model=/PATH/TO/DOWNLOADED/FP32/MODEL --infer_data=$HOME/.cache/paddle/dataset/int8/download/int8_full_val.bin --batch_size=50 --batch_num=1000 --acc_diff_threshold=0.01 --ops_to_quantize="conv2d,pool2d" ``` > Notes: Due to the large number of images in the `int8_full_val.bin` dataset (50 000), the accuracy benchmark may take a long time. To speed up the accuracy measurement, it is recommended to set `OMP_NUM_THREADS` to the maximum number of physical cores available on the server.
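The benchmark command above enables oneDNN through the renamed environment flag. The same switch can also be flipped programmatically; a small sketch, assuming `FLAGS_use_onednn` is registered exactly as this PR's rename of `FLAGS_use_mkldnn` implies:

```python
import paddle

# Toggle oneDNN kernels at runtime via the renamed flag, then read it back.
paddle.set_flags({'FLAGS_use_onednn': True})
print(paddle.get_flags(['FLAGS_use_onednn']))
```
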
diff --git a/test/xpu/op_test_xpu.py b/test/xpu/op_test_xpu.py index 956506bd47e1c0..875280639bcec7 100644 --- a/test/xpu/op_test_xpu.py +++ b/test/xpu/op_test_xpu.py @@ -292,8 +292,8 @@ def get_grad_with_place( # oneDNN numeric gradient should use CPU kernel use_onednn = False - if op_attrs.get("use_mkldnn"): - op_attrs["use_mkldnn"] = False + if op_attrs.get("use_onednn"): + op_attrs["use_onednn"] = False use_onednn = True mean_grad_op_types = get_xpu_op_support_types('mean') @@ -311,7 +311,7 @@ def get_grad_with_place( ) if use_onednn: - op_attrs["use_mkldnn"] = True + op_attrs["use_onednn"] = True if no_grad_set is None: no_grad_set = set() diff --git a/test/xpu/test_batch_norm_op_xpu.py b/test/xpu/test_batch_norm_op_xpu.py index 97ab78297934dd..6bbc3efe16c7f2 100644 --- a/test/xpu/test_batch_norm_op_xpu.py +++ b/test/xpu/test_batch_norm_op_xpu.py @@ -448,7 +448,7 @@ def test_train(self): "epsilon": self.epsilon, "is_test": False, "data_layout": self.data_layout, - "use_mkldnn": False, + "use_onednn": False, "fuse_with_relu": False, "use_global_stats": self.use_global_stats, } diff --git a/test/xpu/test_conv2d_op_xpu.py b/test/xpu/test_conv2d_op_xpu.py index e93f5b89e35d0e..16b80018905c3e 100644 --- a/test/xpu/test_conv2d_op_xpu.py +++ b/test/xpu/test_conv2d_op_xpu.py @@ -241,7 +241,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'fuse_relu_before_depthwise_conv': self.fuse_relu_before_depthwise_conv, 'exhaustive_search': self.exhaustive_search, @@ -402,7 +402,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'fuse_relu_before_depthwise_conv': self.fuse_relu_before_depthwise_conv, 'exhaustive_search': self.exhaustive_search, diff --git a/test/xpu/test_conv2d_transpose_op_xpu.py b/test/xpu/test_conv2d_transpose_op_xpu.py index 487fa004c105c9..8d7070a6697c5e 100644 --- a/test/xpu/test_conv2d_transpose_op_xpu.py +++ b/test/xpu/test_conv2d_transpose_op_xpu.py @@ -168,7 +168,7 @@ def setUp(self): 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, 'is_test': self.is_test, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } if self.output_size is not None: diff --git a/test/xpu/test_conv3d_op_xpu.py b/test/xpu/test_conv3d_op_xpu.py index b198370a87767a..6a96930339129a 100644 --- a/test/xpu/test_conv3d_op_xpu.py +++ b/test/xpu/test_conv3d_op_xpu.py @@ -244,7 +244,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } self.outputs = {'Output': output} @@ -419,7 +419,7 @@ def setUp(self): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } self.outputs = {'Output': output} diff --git a/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py index 578cd3b9c88d85..7c59ded26f6792 100644 --- a/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py +++ b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py @@ -168,7 +168,7 @@ def setUp(self): 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, 'is_test': self.is_test, - 
'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, } if self.output_size is not None: diff --git a/test/xpu/test_elementwise_add_op_xpu.py b/test/xpu/test_elementwise_add_op_xpu.py index ee0c70d75b3341..7f8fc159b1d588 100644 --- a/test/xpu/test_elementwise_add_op_xpu.py +++ b/test/xpu/test_elementwise_add_op_xpu.py @@ -49,7 +49,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def test_check_output(self): diff --git a/test/xpu/test_elementwise_add_op_xpu_kp.py b/test/xpu/test_elementwise_add_op_xpu_kp.py index 857e8d72b188cc..d3ef8e332c06e0 100644 --- a/test/xpu/test_elementwise_add_op_xpu_kp.py +++ b/test/xpu/test_elementwise_add_op_xpu_kp.py @@ -39,7 +39,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def test_check_output(self): diff --git a/test/xpu/test_elementwise_floordiv_op_xpu.py b/test/xpu/test_elementwise_floordiv_op_xpu.py index f5e1a0ecc8356a..a4795874874a21 100644 --- a/test/xpu/test_elementwise_floordiv_op_xpu.py +++ b/test/xpu/test_elementwise_floordiv_op_xpu.py @@ -50,7 +50,7 @@ def setUp(self): 'X': OpTest.np_dtype_to_base_dtype(self.x), 'Y': OpTest.np_dtype_to_base_dtype(self.y), } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} self.outputs = {'Out': self.out} def test_check_output(self): diff --git a/test/xpu/test_elementwise_mod_op_xpu.py b/test/xpu/test_elementwise_mod_op_xpu.py index 035595d2e36e84..b3d212ada318c2 100644 --- a/test/xpu/test_elementwise_mod_op_xpu.py +++ b/test/xpu/test_elementwise_mod_op_xpu.py @@ -48,7 +48,7 @@ def init_input_output(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def init_dtype(self): pass @@ -81,7 +81,7 @@ def init_input_output(self): 'Y': OpTest.np_dtype_to_base_dtype(self.y), } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} class TestRemainderOp(unittest.TestCase): def test_dygraph(self): diff --git a/test/xpu/test_elementwise_mul_op_xpu.py b/test/xpu/test_elementwise_mul_op_xpu.py index c50de0285d66c1..c7116ea8f42905 100644 --- a/test/xpu/test_elementwise_mul_op_xpu.py +++ b/test/xpu/test_elementwise_mul_op_xpu.py @@ -126,7 +126,7 @@ def init_input_output(self): 'Y': self.y, } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def init_axis(self): pass @@ -281,7 +281,7 @@ def init_input_output(self): 'Y': self.y, } self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_onednn} + self.attrs = {'axis': self.axis, 'use_onednn': self.use_onednn} def gen_output(self): if self.cal_x is None: diff --git a/test/xpu/test_gaussian_random_op_xpu.py b/test/xpu/test_gaussian_random_op_xpu.py index f457e0056da782..d2bec51113d8fe 100644 
--- a/test/xpu/test_gaussian_random_op_xpu.py +++ b/test/xpu/test_gaussian_random_op_xpu.py @@ -66,7 +66,7 @@ def setUp(self): "mean": self.mean, "std": self.std, "seed": 10, - "use_mkldnn": self.use_onednn, + "use_onednn": self.use_onednn, "dtype": typeid_dict[self.in_type_str], } paddle.seed(10) @@ -119,7 +119,7 @@ def setUp(self): 'mean': self.mean, 'std': self.std, 'seed': self.seed, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, "dtype": typeid_dict[self.in_type_str], } @@ -185,7 +185,7 @@ def setUp(self): 'mean': self.mean, 'std': self.std, 'seed': self.seed, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, "dtype": typeid_dict[self.in_type_str], } self.outputs = {'Out': np.zeros((123, 92), dtype=self.dtype)} diff --git a/test/xpu/test_pool2d_op_xpu.py b/test/xpu/test_pool2d_op_xpu.py index 1aab84bc6f11b6..a5cc545e7e7d22 100644 --- a/test/xpu/test_pool2d_op_xpu.py +++ b/test/xpu/test_pool2d_op_xpu.py @@ -331,7 +331,7 @@ def setUp(self): 'pooling_type': self.pool_type, 'global_pooling': self.global_pool, 'use_cudnn': self.use_cudnn, - 'use_mkldnn': self.use_onednn, + 'use_onednn': self.use_onednn, 'data_format': self.data_format, 'exclusive': self.exclusive, 'adaptive': self.adaptive, diff --git a/test/xpu/test_transpose_op_xpu.py b/test/xpu/test_transpose_op_xpu.py index 8188984165969e..c46b7174b5def1 100644 --- a/test/xpu/test_transpose_op_xpu.py +++ b/test/xpu/test_transpose_op_xpu.py @@ -40,7 +40,7 @@ def setUp(self): self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': False, + 'use_onednn': False, 'use_xpu': True, } self.outputs = { @@ -156,7 +156,7 @@ def setUp(self): } self.attrs = { 'axis': list(self.axis), - 'use_mkldnn': False, + 'use_onednn': False, 'use_xpu': True, } self.outputs = { diff --git a/tools/xpu/pack_paddle_dependence.sh b/tools/xpu/pack_paddle_dependence.sh index 0cbc258b0f7610..3996d5cd76b067 100644 --- a/tools/xpu/pack_paddle_dependence.sh +++ b/tools/xpu/pack_paddle_dependence.sh @@ -94,10 +94,8 @@ function xhpc_prepare() { cp -r ${XHPC_DIR_NAME}/xpudnn/so/libxpu_dnn.so xpu/lib/ if [[ "${WITH_MKL}" == "ON" ]]; then - cp -r ${BUILD_DIR}/third_party/install/mklml/lib/libiomp5.so xpu/lib/ - pushd xpu/lib - ln -sf libiomp5.so libomp.so - popd + # xpu/lib/libomp.so may be a dangling symlink at pack time (libiomp5.so is not built yet); by the time libomp.so is actually needed, the target exists and the link resolves. + ln -sf ${BUILD_DIR}/third_party/install/mklml/lib/libiomp5.so xpu/lib/libomp.so else cp -r ${XHPC_DIR_NAME}/xpudnn/so/libomp.so xpu/lib/ pushd xpu/lib @@ -160,10 +158,8 @@ function local_assemble() { cp -r ${LOCAL_PATH}/${XHPC_DIR_NAME}/xpudnn/so/libxpu_dnn.so xpu/lib/ if [[ "${WITH_MKL}" == "ON" ]]; then - cp -r ${BUILD_DIR}/third_party/install/mklml/lib/libiomp5.so xpu/lib/ - pushd xpu/lib - ln -sf libiomp5.so libomp.so - popd + # xpu/lib/libomp.so may be a dangling symlink at pack time (libiomp5.so is not built yet); by the time libomp.so is actually needed, the target exists and the link resolves. + ln -sf ${BUILD_DIR}/third_party/install/mklml/lib/libiomp5.so xpu/lib/libomp.so else cp -r ${XHPC_DIR_NAME}/xpudnn/so/libomp.so xpu/lib/ pushd xpu/lib
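Since the repacked layout now leaves xpu/lib/libomp.so pointing at a file that may not exist yet, a quick sanity check for whether the link has become valid (a sketch only; the path is the one the script above creates):

```python
import os

# A dangling symlink reports islink() == True but exists() == False;
# once mklml's libiomp5.so is built, exists() flips to True.
link = 'xpu/lib/libomp.so'
print(os.path.islink(link), os.path.exists(link), os.path.realpath(link))
```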