diff --git a/clang/lib/DPCT/RulesLang/APINamesMath.inc b/clang/lib/DPCT/RulesLang/APINamesMath.inc index c6394ae897bc..8c18aadb5f38 100644 --- a/clang/lib/DPCT/RulesLang/APINamesMath.inc +++ b/clang/lib/DPCT/RulesLang/APINamesMath.inc @@ -615,6 +615,42 @@ ENTRY_REWRITE("__vcmpne2") ENTRY_REWRITE("__vcmpne4") ENTRY_REWRITE("__vhaddu2") ENTRY_REWRITE("__vhaddu4") +ENTRY_REWRITE("__viaddmax_s16x2") +ENTRY_REWRITE("__viaddmax_s16x2_relu") +ENTRY_REWRITE("__viaddmax_s32") +ENTRY_REWRITE("__viaddmax_s32_relu") +ENTRY_REWRITE("__viaddmax_u16x2") +ENTRY_REWRITE("__viaddmax_u32") +ENTRY_REWRITE("__viaddmin_s16x2") +ENTRY_REWRITE("__viaddmin_s16x2_relu") +ENTRY_REWRITE("__viaddmin_s32") +ENTRY_REWRITE("__viaddmin_s32_relu") +ENTRY_REWRITE("__viaddmin_u16x2") +ENTRY_REWRITE("__viaddmin_u32") +ENTRY_REWRITE("__vibmax_s16x2") +ENTRY_REWRITE("__vibmax_s32") +ENTRY_REWRITE("__vibmax_u16x2") +ENTRY_REWRITE("__vibmax_u32") +ENTRY_REWRITE("__vibmin_s16x2") +ENTRY_REWRITE("__vibmin_s32") +ENTRY_REWRITE("__vibmin_u16x2") +ENTRY_REWRITE("__vibmin_u32") +ENTRY_REWRITE("__vimax3_s16x2") +ENTRY_REWRITE("__vimax3_s16x2_relu") +ENTRY_REWRITE("__vimax3_s32") +ENTRY_REWRITE("__vimax3_s32_relu") +ENTRY_REWRITE("__vimax3_u16x2") +ENTRY_REWRITE("__vimax3_u32") +ENTRY_REWRITE("__vimax_s16x2_relu") +ENTRY_REWRITE("__vimax_s32_relu") +ENTRY_REWRITE("__vimin3_s16x2") +ENTRY_REWRITE("__vimin3_s16x2_relu") +ENTRY_REWRITE("__vimin3_s32") +ENTRY_REWRITE("__vimin3_s32_relu") +ENTRY_REWRITE("__vimin3_u16x2") +ENTRY_REWRITE("__vimin3_u32") +ENTRY_REWRITE("__vimin_s16x2_relu") +ENTRY_REWRITE("__vimin_s32_relu") ENTRY_REWRITE("__vmaxs2") ENTRY_REWRITE("__vmaxs4") ENTRY_REWRITE("__vmaxu2") diff --git a/clang/lib/DPCT/RulesLang/Math/RewriterSIMDIntrinsics.cpp b/clang/lib/DPCT/RulesLang/Math/RewriterSIMDIntrinsics.cpp index 2e760f5f3c68..330f351b4d1b 100644 --- a/clang/lib/DPCT/RulesLang/Math/RewriterSIMDIntrinsics.cpp +++ b/clang/lib/DPCT/RulesLang/Math/RewriterSIMDIntrinsics.cpp @@ -847,6 +847,652 @@ RewriterMap dpct::createSIMDIntrinsicsRewriterMap() { MapNames::getClNamespace() + "uchar4>", ARG(0), ARG(1), LITERAL(MapNames::getDpctNamespace() + "hadd()"))))) + // __viaddmax_s16x2 + MATH_API_REWRITERS_V2( + "__viaddmax_s16x2", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__viaddmax_s16x2", + CALL(MapNames::getClNamespace() + + "ext::intel::math::viaddmax_s16x2", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__viaddmax_s16x2", + CALL(MapNames::getDpctNamespace() + "vectorized_ternary<" + + MapNames::getClNamespace() + "short2>", + ARG(0), ARG(1), ARG(2), LITERAL("std::plus<>()"), + LITERAL(MapNames::getDpctNamespace() + "maximum()"))))) + // __viaddmax_s16x2_relu + MATH_API_REWRITERS_V2( + "__viaddmax_s16x2_relu", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__viaddmax_s16x2_relu", + CALL(MapNames::getClNamespace() + + "ext::intel::math::viaddmax_s16x2_relu", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__viaddmax_s16x2_relu", + CALL(MapNames::getDpctNamespace() + "vectorized_ternary<" + + MapNames::getClNamespace() + "short2>", + ARG(0), ARG(1), ARG(2), LITERAL("std::plus<>()"), + LITERAL(MapNames::getDpctNamespace() + "maximum()"), + LITERAL("true"))))) + // __viaddmax_s32 + MATH_API_REWRITERS_V2( + "__viaddmax_s32", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY("__viaddmax_s32", + CALL(MapNames::getClNamespace() + + "ext::intel::math::viaddmax_s32", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__viaddmax_s32", + CALL(MapNames::getClNamespace() + "max", + BO(BinaryOperatorKind::BO_Add, ARG(0), ARG(1)), + ARG(2))))) + // __viaddmax_s32_relu + MATH_API_REWRITERS_V2( + "__viaddmax_s32_relu", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__viaddmax_s32_relu", + CALL(MapNames::getClNamespace() + + "ext::intel::math::viaddmax_s32_relu", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__viaddmax_s32_relu", + CALL(MapNames::getDpctNamespace() + "relu", + CALL(MapNames::getClNamespace() + "max", + BO(BinaryOperatorKind::BO_Add, ARG(0), ARG(1)), + ARG(2)))))) + // __viaddmax_u16x2 + MATH_API_REWRITERS_V2( + "__viaddmax_u16x2", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__viaddmax_u16x2", + CALL(MapNames::getClNamespace() + + "ext::intel::math::viaddmax_u16x2", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__viaddmax_u16x2", + CALL(MapNames::getDpctNamespace() + "vectorized_ternary<" + + MapNames::getClNamespace() + "ushort2>", + ARG(0), ARG(1), ARG(2), LITERAL("std::plus<>()"), + LITERAL(MapNames::getDpctNamespace() + "maximum()"))))) + // __viaddmax_u32 + MATH_API_REWRITERS_V2( + "__viaddmax_u32", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__viaddmax_u32", + CALL(MapNames::getClNamespace() + + "ext::intel::math::viaddmax_u32", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__viaddmax_u32", + CALL(MapNames::getClNamespace() + "max", + BO(BinaryOperatorKind::BO_Add, ARG(0), ARG(1)), + ARG(2))))) + // __viaddmin_s16x2 + MATH_API_REWRITERS_V2( + "__viaddmin_s16x2", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__viaddmin_s16x2", + CALL(MapNames::getClNamespace() + + "ext::intel::math::viaddmin_s16x2", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__viaddmin_s16x2", + CALL(MapNames::getDpctNamespace() + "vectorized_ternary<" + + MapNames::getClNamespace() + "short2>", + ARG(0), ARG(1), ARG(2), LITERAL("std::plus<>()"), + LITERAL(MapNames::getDpctNamespace() + "minimum()"))))) + // __viaddmin_s16x2_relu + MATH_API_REWRITERS_V2( + "__viaddmin_s16x2_relu", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__viaddmin_s16x2_relu", + CALL(MapNames::getClNamespace() + + "ext::intel::math::viaddmin_s16x2_relu", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__viaddmin_s16x2_relu", + CALL(MapNames::getDpctNamespace() + "vectorized_ternary<" + + MapNames::getClNamespace() + "short2>", + ARG(0), ARG(1), ARG(2), LITERAL("std::plus<>()"), + LITERAL(MapNames::getDpctNamespace() + "minimum()"), + LITERAL("true"))))) + // __viaddmin_s32 + MATH_API_REWRITERS_V2( + "__viaddmin_s32", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY("__viaddmin_s32", + CALL(MapNames::getClNamespace() + + "ext::intel::math::viaddmin_s32", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__viaddmin_s32", + CALL(MapNames::getClNamespace() + "min", + BO(BinaryOperatorKind::BO_Add, ARG(0), ARG(1)), + ARG(2))))) + // __viaddmin_s32_relu + MATH_API_REWRITERS_V2( + "__viaddmin_s32_relu", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__viaddmin_s32_relu", + CALL(MapNames::getClNamespace() + + "ext::intel::math::viaddmin_s32_relu", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__viaddmin_s32_relu", + CALL(MapNames::getDpctNamespace() + "relu", + CALL(MapNames::getClNamespace() + "min", + BO(BinaryOperatorKind::BO_Add, ARG(0), ARG(1)), + ARG(2)))))) + // __viaddmin_u16x2 + MATH_API_REWRITERS_V2( + "__viaddmin_u16x2", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__viaddmin_u16x2", + CALL(MapNames::getClNamespace() + + "ext::intel::math::viaddmin_u16x2", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__viaddmin_u16x2", + CALL(MapNames::getDpctNamespace() + "vectorized_ternary<" + + MapNames::getClNamespace() + "ushort2>", + ARG(0), ARG(1), ARG(2), LITERAL("std::plus<>()"), + LITERAL(MapNames::getDpctNamespace() + "minimum()"))))) + // __viaddmin_u32 + MATH_API_REWRITERS_V2( + "__viaddmin_u32", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__viaddmin_u32", + CALL(MapNames::getClNamespace() + + "ext::intel::math::viaddmin_u32", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__viaddmin_u32", + CALL(MapNames::getClNamespace() + "min", + BO(BinaryOperatorKind::BO_Add, ARG(0), ARG(1)), + ARG(2))))) + // __vibmax_s16x2 + MATH_API_REWRITERS_V2( + "__vibmax_s16x2", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vibmax_s16x2", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vibmax_s16x2", + ARG(0), ARG(1), ARG(2), ARG(3)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vibmax_s16x2", + CALL(MapNames::getDpctNamespace() + + "vectorized_binary_with_pred", + ARG(0), ARG(1), + LITERAL(MapNames::getDpctNamespace() + "maximum()"), + ARG(2), ARG(3))))) + // __vibmax_s32 + MATH_API_REWRITERS_V2( + "__vibmax_s32", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY("__vibmax_s32", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vibmax_s32", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vibmax_s32", + CALL(MapNames::getDpctNamespace() + "maximum()", ARG(0), + ARG(1), ARG(2))))) + // __vibmax_u16x2 + MATH_API_REWRITERS_V2( + "__vibmax_u16x2", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vibmax_u16x2", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vibmax_u16x2", + ARG(0), ARG(1), ARG(2), ARG(3)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vibmax_u16x2", + CALL(MapNames::getDpctNamespace() + + "vectorized_binary_with_pred", + ARG(0), ARG(1), + LITERAL(MapNames::getDpctNamespace() + "maximum()"), + ARG(2), ARG(3))))) + // __vibmax_u32 + MATH_API_REWRITERS_V2( + "__vibmax_u32", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vibmax_u32", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vibmax_u32", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vibmax_u32", + CALL(MapNames::getDpctNamespace() + "maximum()", ARG(0), + ARG(1), ARG(2))))) + // __vibmin_s16x2 + MATH_API_REWRITERS_V2( + "__vibmin_s16x2", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vibmin_s16x2", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vibmin_s16x2", + ARG(0), ARG(1), ARG(2), ARG(3)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vibmin_s16x2", + CALL(MapNames::getDpctNamespace() + + "vectorized_binary_with_pred", + ARG(0), ARG(1), + LITERAL(MapNames::getDpctNamespace() + "minimum()"), + ARG(2), ARG(3))))) + // __vibmin_s32 + MATH_API_REWRITERS_V2( + "__vibmin_s32", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY("__vibmin_s32", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vibmin_s32", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vibmin_s32", + CALL(MapNames::getDpctNamespace() + "minimum()", ARG(0), + ARG(1), ARG(2))))) + // __vibmin_u16x2 + MATH_API_REWRITERS_V2( + "__vibmin_u16x2", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vibmin_u16x2", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vibmin_u16x2", + ARG(0), ARG(1), ARG(2), ARG(3)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vibmin_u16x2", + CALL(MapNames::getDpctNamespace() + + "vectorized_binary_with_pred", + ARG(0), ARG(1), + LITERAL(MapNames::getDpctNamespace() + "minimum()"), + ARG(2), ARG(3))))) + // __vibmin_u32 + MATH_API_REWRITERS_V2( + "__vibmin_u32", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vibmin_u32", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vibmin_u32", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vibmin_u32", + CALL(MapNames::getDpctNamespace() + "minimum()", ARG(0), + ARG(1), ARG(2))))) + // __vimax3_s16x2 + MATH_API_REWRITERS_V2( + "__vimax3_s16x2", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimax3_s16x2", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimax3_s16x2", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimax3_s16x2", + CALL(MapNames::getDpctNamespace() + "vectorized_ternary<" + + MapNames::getClNamespace() + "short2>", + ARG(0), ARG(1), ARG(2), + LITERAL(MapNames::getDpctNamespace() + "maximum()"), + LITERAL(MapNames::getDpctNamespace() + "maximum()"))))) + // __vimax3_s16x2_relu + MATH_API_REWRITERS_V2( + "__vimax3_s16x2_relu", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimax3_s16x2_relu", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimax3_s16x2_relu", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimax3_s16x2_relu", + CALL(MapNames::getDpctNamespace() + "vectorized_ternary<" + + MapNames::getClNamespace() + "short2>", + ARG(0), ARG(1), ARG(2), + LITERAL(MapNames::getDpctNamespace() + "maximum()"), + LITERAL(MapNames::getDpctNamespace() + "maximum()"), + LITERAL("true"))))) + // __vimax3_s32 + MATH_API_REWRITERS_V2( + "__vimax3_s32", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY("__vimax3_s32", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimax3_s32", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimax3_s32", + CALL(MapNames::getClNamespace() + "max", + CALL(MapNames::getClNamespace() + "max", ARG(0), + ARG(1)), + ARG(2))))) + // __vimax3_s32_relu + MATH_API_REWRITERS_V2( + "__vimax3_s32_relu", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimax3_s32_relu", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimax3_s32_relu", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimax3_s32_relu", + CALL(MapNames::getDpctNamespace() + "relu", + CALL(MapNames::getClNamespace() + "max", + CALL(MapNames::getClNamespace() + "max", + ARG(0), ARG(1)), + ARG(2)))))) + // __vimax3_u16x2 + MATH_API_REWRITERS_V2( + "__vimax3_u16x2", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimax3_u16x2", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimax3_u16x2", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimax3_u16x2", + CALL(MapNames::getDpctNamespace() + "vectorized_ternary<" + + MapNames::getClNamespace() + "ushort2>", + ARG(0), ARG(1), ARG(2), + LITERAL(MapNames::getDpctNamespace() + "maximum()"), + LITERAL(MapNames::getDpctNamespace() + "maximum()"))))) + // __vimax3_u32 + MATH_API_REWRITERS_V2( + "__vimax3_u32", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimax3_u32", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimax3_u32", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimax3_u32", + CALL(MapNames::getClNamespace() + "max", + CALL(MapNames::getClNamespace() + "max", + ARG(0), ARG(1)), + ARG(2))))) + // __vimax_s16x2_relu + MATH_API_REWRITERS_V2( + "__vimax_s16x2_relu", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimax_s16x2_relu", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimax_s16x2_relu", + ARG(0), ARG(1)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimax_s16x2_relu", + CALL(MapNames::getDpctNamespace() + "vectorized_binary<" + + MapNames::getClNamespace() + "short2>", + ARG(0), ARG(1), + LITERAL(MapNames::getDpctNamespace() + "maximum()"), + LITERAL("true"))))) + // __vimax_s32_relu + MATH_API_REWRITERS_V2( + "__vimax_s32_relu", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimax_s32_relu", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimax_s32_relu", + ARG(0), ARG(1)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimax_s32_relu", + CALL(MapNames::getDpctNamespace() + "relu", + CALL(MapNames::getClNamespace() + "max", ARG(0), + ARG(1)))))) + // __vimin3_s16x2 + MATH_API_REWRITERS_V2( + "__vimin3_s16x2", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimin3_s16x2", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimin3_s16x2", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimin3_s16x2", + CALL(MapNames::getDpctNamespace() + "vectorized_ternary<" + + MapNames::getClNamespace() + "short2>", + ARG(0), ARG(1), ARG(2), + LITERAL(MapNames::getDpctNamespace() + "minimum()"), + LITERAL(MapNames::getDpctNamespace() + "minimum()"))))) + // __vimin3_s16x2_relu + MATH_API_REWRITERS_V2( + "__vimin3_s16x2_relu", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimin3_s16x2_relu", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimin3_s16x2_relu", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimin3_s16x2_relu", + CALL(MapNames::getDpctNamespace() + "vectorized_ternary<" + + MapNames::getClNamespace() + "short2>", + ARG(0), ARG(1), ARG(2), + LITERAL(MapNames::getDpctNamespace() + "minimum()"), + LITERAL(MapNames::getDpctNamespace() + "minimum()"), + LITERAL("true"))))) + // __vimin3_s32 + MATH_API_REWRITERS_V2( + "__vimin3_s32", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY("__vimin3_s32", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimin3_s32", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimin3_s32", + CALL(MapNames::getClNamespace() + "min", + CALL(MapNames::getClNamespace() + "min", ARG(0), + ARG(1)), + ARG(2))))) + // __vimin3_s32_relu + MATH_API_REWRITERS_V2( + "__vimin3_s32_relu", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimin3_s32_relu", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimin3_s32_relu", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimin3_s32_relu", + CALL(MapNames::getDpctNamespace() + "relu", + CALL(MapNames::getClNamespace() + "min", + CALL(MapNames::getClNamespace() + "min", + ARG(0), ARG(1)), + ARG(2)))))) + // __vimin3_u16x2 + MATH_API_REWRITERS_V2( + "__vimin3_u16x2", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimin3_u16x2", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimin3_u16x2", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimin3_u16x2", + CALL(MapNames::getDpctNamespace() + "vectorized_ternary<" + + MapNames::getClNamespace() + "ushort2>", + ARG(0), ARG(1), ARG(2), + LITERAL(MapNames::getDpctNamespace() + "minimum()"), + LITERAL(MapNames::getDpctNamespace() + "minimum()"))))) + // __vimin3_u32 + MATH_API_REWRITERS_V2( + "__vimin3_u32", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimin3_u32", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimin3_u32", + ARG(0), ARG(1), ARG(2)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimin3_u32", + CALL(MapNames::getClNamespace() + "min", + CALL(MapNames::getClNamespace() + "min", + ARG(0), ARG(1)), + ARG(2))))) + // __vimin_s16x2_relu + MATH_API_REWRITERS_V2( + "__vimin_s16x2_relu", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimin_s16x2_relu", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimin_s16x2_relu", + ARG(0), ARG(1)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimin_s16x2_relu", + CALL(MapNames::getDpctNamespace() + "vectorized_binary<" + + MapNames::getClNamespace() + "short2>", + ARG(0), ARG(1), + LITERAL(MapNames::getDpctNamespace() + "minimum()"), + LITERAL("true"))))) + // __vimin_s32_relu + MATH_API_REWRITERS_V2( + "__vimin_s32_relu", + MATH_API_REWRITER_PAIR( + math::Tag::math_libdevice, + CALL_FACTORY_ENTRY( + "__vimin_s32_relu", + CALL(MapNames::getClNamespace() + + "ext::intel::math::vimin_s32_relu", + ARG(0), ARG(1)))), + MATH_API_REWRITER_PAIR( + math::Tag::emulation, + CALL_FACTORY_ENTRY( + "__vimin_s32_relu", + CALL(MapNames::getDpctNamespace() + "relu", + CALL(MapNames::getClNamespace() + "min", ARG(0), + ARG(1)))))) // __vmaxs2 MATH_API_REWRITERS_V2( "__vmaxs2", diff --git a/clang/lib/DPCT/SrcAPI/APINames.inc b/clang/lib/DPCT/SrcAPI/APINames.inc index f2fa3be225ca..71e5de2974ba 100644 --- a/clang/lib/DPCT/SrcAPI/APINames.inc +++ b/clang/lib/DPCT/SrcAPI/APINames.inc @@ -1309,42 +1309,42 @@ ENTRY(__vcmpne2, __vcmpne2, true, NO_FLAG, P4, "Successful") ENTRY(__vcmpne4, __vcmpne4, true, NO_FLAG, P4, "Successful") ENTRY(__vhaddu2, __vhaddu2, true, NO_FLAG, P4, "Successful") ENTRY(__vhaddu4, __vhaddu4, true, NO_FLAG, P4, "Successful") -ENTRY(__viaddmax_s16x2, __viaddmax_s16x2, false, NO_FLAG, P4, "comment") -ENTRY(__viaddmax_s16x2_relu, __viaddmax_s16x2_relu, false, NO_FLAG, P4, "comment") -ENTRY(__viaddmax_s32, __viaddmax_s32, false, NO_FLAG, P4, "comment") -ENTRY(__viaddmax_s32_relu, __viaddmax_s32_relu, false, NO_FLAG, P4, "comment") -ENTRY(__viaddmax_u16x2, __viaddmax_u16x2, false, NO_FLAG, P4, "comment") -ENTRY(__viaddmax_u32, __viaddmax_u32, false, NO_FLAG, P4, "comment") -ENTRY(__viaddmin_s16x2, __viaddmin_s16x2, false, NO_FLAG, P4, "comment") -ENTRY(__viaddmin_s16x2_relu, __viaddmin_s16x2_relu, false, NO_FLAG, P4, "comment") -ENTRY(__viaddmin_s32, __viaddmin_s32, false, NO_FLAG, P4, "comment") -ENTRY(__viaddmin_s32_relu, __viaddmin_s32_relu, false, NO_FLAG, P4, "comment") -ENTRY(__viaddmin_u16x2, __viaddmin_u16x2, false, NO_FLAG, P4, "comment") -ENTRY(__viaddmin_u32, __viaddmin_u32, false, NO_FLAG, P4, "comment") -ENTRY(__vibmax_s16x2, __vibmax_s16x2, false, NO_FLAG, P4, "comment") -ENTRY(__vibmax_s32, __vibmax_s32, false, NO_FLAG, P4, "comment") -ENTRY(__vibmax_u16x2, __vibmax_u16x2, false, NO_FLAG, P4, "comment") -ENTRY(__vibmax_u32, __vibmax_u32, false, NO_FLAG, P4, "comment") -ENTRY(__vibmin_s16x2, __vibmin_s16x2, false, NO_FLAG, P4, "comment") -ENTRY(__vibmin_s32, __vibmin_s32, false, NO_FLAG, P4, "comment") -ENTRY(__vibmin_u16x2, __vibmin_u16x2, false, NO_FLAG, P4, "comment") -ENTRY(__vibmin_u32, __vibmin_u32, false, NO_FLAG, P4, "comment") -ENTRY(__vimax3_s16x2, __vimax3_s16x2, false, NO_FLAG, P4, "comment") -ENTRY(__vimax3_s16x2_relu, __vimax3_s16x2_relu, false, NO_FLAG, P4, "comment") -ENTRY(__vimax3_s32, __vimax3_s32, false, NO_FLAG, P4, "comment") -ENTRY(__vimax3_s32_relu, __vimax3_s32_relu, false, NO_FLAG, P4, "comment") -ENTRY(__vimax3_u16x2, __vimax3_u16x2, false, NO_FLAG, P4, "comment") -ENTRY(__vimax3_u32, __vimax3_u32, false, NO_FLAG, P4, "comment") -ENTRY(__vimax_s16x2_relu, __vimax_s16x2_relu, false, NO_FLAG, P4, "comment") -ENTRY(__vimax_s32_relu, __vimax_s32_relu, false, NO_FLAG, P4, "comment") -ENTRY(__vimin3_s16x2, __vimin3_s16x2, false, NO_FLAG, P4, "comment") -ENTRY(__vimin3_s16x2_relu, __vimin3_s16x2_relu, false, NO_FLAG, P4, "comment") -ENTRY(__vimin3_s32, __vimin3_s32, false, NO_FLAG, P4, "comment") -ENTRY(__vimin3_s32_relu, __vimin3_s32_relu, false, NO_FLAG, P4, "comment") -ENTRY(__vimin3_u16x2, __vimin3_u16x2, false, NO_FLAG, P4, "comment") -ENTRY(__vimin3_u32, __vimin3_u32, false, NO_FLAG, P4, "comment") -ENTRY(__vimin_s16x2_relu, __vimin_s16x2_relu, false, NO_FLAG, P4, "comment") -ENTRY(__vimin_s32_relu, __vimin_s32_relu, false, NO_FLAG, P4, "comment") +ENTRY(__viaddmax_s16x2, __viaddmax_s16x2, true, NO_FLAG, P4, "Successful") +ENTRY(__viaddmax_s16x2_relu, __viaddmax_s16x2_relu, true, NO_FLAG, P4, "Successful") +ENTRY(__viaddmax_s32, __viaddmax_s32, true, NO_FLAG, P4, "Successful") +ENTRY(__viaddmax_s32_relu, __viaddmax_s32_relu, true, NO_FLAG, P4, "Successful") +ENTRY(__viaddmax_u16x2, __viaddmax_u16x2, true, NO_FLAG, P4, "Successful") +ENTRY(__viaddmax_u32, __viaddmax_u32, true, NO_FLAG, P4, "Successful") +ENTRY(__viaddmin_s16x2, __viaddmin_s16x2, true, NO_FLAG, P4, "Successful") +ENTRY(__viaddmin_s16x2_relu, __viaddmin_s16x2_relu, true, NO_FLAG, P4, "Successful") +ENTRY(__viaddmin_s32, __viaddmin_s32, true, NO_FLAG, P4, "Successful") +ENTRY(__viaddmin_s32_relu, __viaddmin_s32_relu, true, NO_FLAG, P4, "Successful") +ENTRY(__viaddmin_u16x2, __viaddmin_u16x2, true, NO_FLAG, P4, "Successful") +ENTRY(__viaddmin_u32, __viaddmin_u32, true, NO_FLAG, P4, "Successful") +ENTRY(__vibmax_s16x2, __vibmax_s16x2, true, NO_FLAG, P4, "Successful") +ENTRY(__vibmax_s32, __vibmax_s32, true, NO_FLAG, P4, "Successful") +ENTRY(__vibmax_u16x2, __vibmax_u16x2, true, NO_FLAG, P4, "Successful") +ENTRY(__vibmax_u32, __vibmax_u32, true, NO_FLAG, P4, "Successful") +ENTRY(__vibmin_s16x2, __vibmin_s16x2, true, NO_FLAG, P4, "Successful") +ENTRY(__vibmin_s32, __vibmin_s32, true, NO_FLAG, P4, "Successful") +ENTRY(__vibmin_u16x2, __vibmin_u16x2, true, NO_FLAG, P4, "Successful") +ENTRY(__vibmin_u32, __vibmin_u32, true, NO_FLAG, P4, "Successful") +ENTRY(__vimax3_s16x2, __vimax3_s16x2, true, NO_FLAG, P4, "Successful") +ENTRY(__vimax3_s16x2_relu, __vimax3_s16x2_relu, true, NO_FLAG, P4, "Successful") +ENTRY(__vimax3_s32, __vimax3_s32, true, NO_FLAG, P4, "Successful") +ENTRY(__vimax3_s32_relu, __vimax3_s32_relu, true, NO_FLAG, P4, "Successful") +ENTRY(__vimax3_u16x2, __vimax3_u16x2, true, NO_FLAG, P4, "Successful") +ENTRY(__vimax3_u32, __vimax3_u32, true, NO_FLAG, P4, "Successful") +ENTRY(__vimax_s16x2_relu, __vimax_s16x2_relu, true, NO_FLAG, P4, "Successful") +ENTRY(__vimax_s32_relu, __vimax_s32_relu, true, NO_FLAG, P4, "Successful") +ENTRY(__vimin3_s16x2, __vimin3_s16x2, true, NO_FLAG, P4, "Successful") +ENTRY(__vimin3_s16x2_relu, __vimin3_s16x2_relu, true, NO_FLAG, P4, "Successful") +ENTRY(__vimin3_s32, __vimin3_s32, true, NO_FLAG, P4, "Successful") +ENTRY(__vimin3_s32_relu, __vimin3_s32_relu, true, NO_FLAG, P4, "Successful") +ENTRY(__vimin3_u16x2, __vimin3_u16x2, true, NO_FLAG, P4, "Successful") +ENTRY(__vimin3_u32, __vimin3_u32, true, NO_FLAG, P4, "Successful") +ENTRY(__vimin_s16x2_relu, __vimin_s16x2_relu, true, NO_FLAG, P4, "Successful") +ENTRY(__vimin_s32_relu, __vimin_s32_relu, true, NO_FLAG, P4, "Successful") ENTRY(__vmaxs2, __vmaxs2, true, NO_FLAG, P4, "Successful") ENTRY(__vmaxs4, __vmaxs4, true, NO_FLAG, P4, "Successful") ENTRY(__vmaxu2, __vmaxu2, true, NO_FLAG, P4, "Successful") diff --git a/clang/runtime/dpct-rt/include/dpct/math.hpp b/clang/runtime/dpct-rt/include/dpct/math.hpp index e14408fbf5d5..d049298a68bf 100644 --- a/clang/runtime/dpct-rt/include/dpct/math.hpp +++ b/clang/runtime/dpct-rt/include/dpct/math.hpp @@ -481,8 +481,12 @@ template inline T relu(T a) { else return a < zero ? zero : a; } -template inline sycl::vec relu(const sycl::vec a) { - return {relu(a[0]), relu(a[1])}; +template +inline sycl::vec relu(const sycl::vec a) { + sycl::vec ret; + for (int i = 0; i < N; ++i) + ret[i] = relu(a[i]); + return ret; } template inline sycl::marray relu(const sycl::marray a) { return {relu(a[0]), relu(a[1])}; @@ -610,6 +614,10 @@ struct maximum { template auto operator()(const T x, const T y) const { return sycl::max(x, y); } + template + auto operator()(const T x, const T y, bool *pred) const { + return (x >= y) ? ((*pred = true), x) : ((*pred = false), y); + } }; /// A sycl::min wrapper functors. @@ -617,6 +625,10 @@ struct minimum { template auto operator()(const T x, const T y) const { return sycl::min(x, y); } + template + auto operator()(const T x, const T y, bool *pred) const { + return (x <= y) ? ((*pred = true), x) : ((*pred = false), y); + } }; /// A sycl::sub_sat wrapper functors. @@ -655,19 +667,47 @@ struct average { /// \tparam [in] BinaryOperation The binary operation class /// \param [in] a The first value /// \param [in] b The second value +/// \param [in] binary_op The operation to do with the two values +/// \param [in] need_relu Whether the result need relu saturation /// \returns The vectorized binary operation value of the two values template inline unsigned vectorized_binary(unsigned a, unsigned b, - const BinaryOperation binary_op) { + const BinaryOperation binary_op, + bool need_relu = false) { sycl::vec v0{a}, v1{b}; auto v2 = v0.as(); auto v3 = v1.as(); auto v4 = detail::vectorized_binary()(v2, v3, binary_op); + if (need_relu) + v4 = relu(v4); v0 = v4.template as>(); return v0; } +/// Compute vectorized binary operation value with pred for two values, with +/// each value treated as a 2 \p T type elements vector type. +/// +/// \tparam [in] T The type of elements type of the vector +/// \tparam [in] BinaryOperation The binary operation class +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] binary_op The operation with pred to do with the two values +/// \param [out] pred_hi The pred pointer that pass into high halfword operation +/// \param [out] pred_lo The pred pointer that pass into low halfword operation +/// \returns The vectorized binary operation value of the two values +template +inline unsigned vectorized_binary_with_pred(unsigned a, unsigned b, + const BinaryOperation binary_op, + bool *pred_hi, bool *pred_lo) { + auto v1 = sycl::vec(a).as>(); + auto v2 = sycl::vec(b).as>(); + sycl::vec ret; + ret[0] = binary_op(v1[0], v2[0], pred_lo); + ret[1] = binary_op(v1[1], v2[1], pred_hi); + return ret.template as>(); +} + /// Compute vectorized isgreater for two values, with each value treated as a /// vector type \p S. /// \tparam [in] S The type of the vector @@ -751,6 +791,35 @@ inline unsigned vectorized_sum_abs_diff(unsigned a, unsigned b) { return sum; } +/// Compute two vectorized binary operation value with pred for three values, +/// with each value treated as a 2 \p T type elements vector type. +/// +/// \tparam [in] VecT The type of the vector +/// \tparam [in] BinaryOperation1 The first binary operation class +/// \tparam [in] BinaryOperation2 The second binary operation class +/// \param [in] a The first value +/// \param [in] b The second value +/// \param [in] c The third value +/// \param [in] binary_op1 The first operation to do with the first two values +/// \param [in] binary_op2 The second operation to do with the third values +/// \param [in] need_relu Whether the result need relu saturation +/// \returns The two vectorized binary operation value of the three values +template +inline unsigned vectorized_ternary(unsigned a, unsigned b, unsigned c, + const BinaryOperation1 binary_op1, + const BinaryOperation2 binary_op2, + bool need_relu = false) { + const auto v1 = sycl::vec(a).as(); + const auto v2 = sycl::vec(b).as(); + const auto v3 = sycl::vec(c).as(); + auto v4 = + detail::vectorized_binary()(v1, v2, binary_op1); + v4 = detail::vectorized_binary()(v4, v3, binary_op2); + if (need_relu) + v4 = relu(v4); + return v4.template as>(); +} + namespace detail { /// Extend the 'val' to 'bit' size, zero extend for unsigned int and signed /// extend for signed int. diff --git a/clang/test/dpct/math/cuda-math-extension-cuda12-after.cu b/clang/test/dpct/math/cuda-math-extension-cuda12-after.cu new file mode 100644 index 000000000000..b6409d7ea3dc --- /dev/null +++ b/clang/test/dpct/math/cuda-math-extension-cuda12-after.cu @@ -0,0 +1,85 @@ +// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2, cuda-11.0, cuda-11.1, cuda-11.2, cuda-11.3, cuda-11.4, cuda-11.5, cuda-11.6, cuda-11.7, cuda-11.8 +// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2, v11.0, v11.1, v11.2, v11.3, v11.4, v11.5, v11.6, v11.7, v11.8 +// RUN: dpct --format-range=none --use-dpcpp-extensions=intel_device_math -out-root %T/math/cuda-math-extension-cuda12-after %s --cuda-include-path="%cuda-path/include" -- -x cuda --cuda-host-only --std=c++14 +// RUN: FileCheck --input-file %T/math/cuda-math-extension-cuda12-after/cuda-math-extension-cuda12-after.dp.cpp --match-full-lines %s +// RUN: %if build_lit %{icpx -c -fsycl %T/math/cuda-math-extension-cuda12-after/cuda-math-extension-cuda12-after.dp.cpp -o %T/math/cuda-math-extension-cuda12-after/cuda-math-extension-cuda12-after.dp.o %} + +__global__ void kernelFuncSIMD() { + unsigned int u, u_1, u_2, u_3; + int i, i_1, i_2, i_3; + bool b_1, b_2; + // CHECK: u_3 = sycl::ext::intel::math::viaddmax_s16x2(u, u_1, u_2); + u_3 = __viaddmax_s16x2(u, u_1, u_2); + // CHECK: u_3 = sycl::ext::intel::math::viaddmax_s16x2_relu(u, u_1, u_2); + u_3 = __viaddmax_s16x2_relu(u, u_1, u_2); + // CHECK: i_3 = sycl::ext::intel::math::viaddmax_s32(i, i_1, i_2); + i_3 = __viaddmax_s32(i, i_1, i_2); + // CHECK: i_3 = sycl::ext::intel::math::viaddmax_s32_relu(i, i_1, i_2); + i_3 = __viaddmax_s32_relu(i, i_1, i_2); + // CHECK: u_3 = sycl::ext::intel::math::viaddmax_u16x2(u, u_1, u_2); + u_3 = __viaddmax_u16x2(u, u_1, u_2); + // CHECK: u_3 = sycl::ext::intel::math::viaddmax_u32(u, u_1, u_2); + u_3 = __viaddmax_u32(u, u_1, u_2); + // CHECK: u_3 = sycl::ext::intel::math::viaddmin_s16x2(u, u_1, u_2); + u_3 = __viaddmin_s16x2(u, u_1, u_2); + // CHECK: u_3 = sycl::ext::intel::math::viaddmin_s16x2_relu(u, u_1, u_2); + u_3 = __viaddmin_s16x2_relu(u, u_1, u_2); + // CHECK: i_3 = sycl::ext::intel::math::viaddmin_s32(i, i_1, i_2); + i_3 = __viaddmin_s32(i, i_1, i_2); + // CHECK: i_3 = sycl::ext::intel::math::viaddmin_s32_relu(i, i_1, i_2); + i_3 = __viaddmin_s32_relu(i, i_1, i_2); + // CHECK: u_3 = sycl::ext::intel::math::viaddmin_u16x2(u, u_1, u_2); + u_3 = __viaddmin_u16x2(u, u_1, u_2); + // CHECK: u_3 = sycl::ext::intel::math::viaddmin_u32(u, u_1, u_2); + u_3 = __viaddmin_u32(u, u_1, u_2); + // CHECK: u_2 = sycl::ext::intel::math::vibmax_s16x2(u, u_1, &b_1, &b_2); + u_2 = __vibmax_s16x2(u, u_1, &b_1, &b_2); + // CHECK: i_2 = sycl::ext::intel::math::vibmax_s32(i, i_1, &b_1); + i_2 = __vibmax_s32(i, i_1, &b_1); + // CHECK: u_2 = sycl::ext::intel::math::vibmax_u16x2(u, u_1, &b_1, &b_2); + u_2 = __vibmax_u16x2(u, u_1, &b_1, &b_2); + // CHECK: u_2 = sycl::ext::intel::math::vibmax_u32(u, u_1, &b_1); + u_2 = __vibmax_u32(u, u_1, &b_1); + // CHECK: u_2 = sycl::ext::intel::math::vibmin_s16x2(u, u_1, &b_1, &b_2); + u_2 = __vibmin_s16x2(u, u_1, &b_1, &b_2); + // CHECK: i_2 = sycl::ext::intel::math::vibmin_s32(i, i_1, &b_1); + i_2 = __vibmin_s32(i, i_1, &b_1); + // CHECK: u_2 = sycl::ext::intel::math::vibmin_u16x2(u, u_1, &b_1, &b_2); + u_2 = __vibmin_u16x2(u, u_1, &b_1, &b_2); + // CHECK: u_2 = sycl::ext::intel::math::vibmin_u32(u, u_1, &b_1); + u_2 = __vibmin_u32(u, u_1, &b_1); + // CHECK: u_3 = sycl::ext::intel::math::vimax3_s16x2(u, u_1, u_2); + u_3 = __vimax3_s16x2(u, u_1, u_2); + // CHECK: u_3 = sycl::ext::intel::math::vimax3_s16x2_relu(u, u_1, u_2); + u_3 = __vimax3_s16x2_relu(u, u_1, u_2); + // CHECK: i_3 = sycl::ext::intel::math::vimax3_s32(i, i_1, i_2); + i_3 = __vimax3_s32(i, i_1, i_2); + // CHECK: i_3 = sycl::ext::intel::math::vimax3_s32_relu(i, i_1, i_2); + i_3 = __vimax3_s32_relu(i, i_1, i_2); + // CHECK: u_3 = sycl::ext::intel::math::vimax3_u16x2(u, u_1, u_2); + u_3 = __vimax3_u16x2(u, u_1, u_2); + // CHECK: u_3 = sycl::ext::intel::math::vimax3_u32(u, u_1, u_2); + u_3 = __vimax3_u32(u, u_1, u_2); + // CHECK: u_2 = sycl::ext::intel::math::vimax_s16x2_relu(u, u_1); + u_2 = __vimax_s16x2_relu(u, u_1); + // CHECK: i_2 = sycl::ext::intel::math::vimax_s32_relu(i, i_1); + i_2 = __vimax_s32_relu(i, i_1); + // CHECK: u_3 = sycl::ext::intel::math::vimin3_s16x2(u, u_1, u_2); + u_3 = __vimin3_s16x2(u, u_1, u_2); + // CHECK: u_3 = sycl::ext::intel::math::vimin3_s16x2_relu(u, u_1, u_2); + u_3 = __vimin3_s16x2_relu(u, u_1, u_2); + // CHECK: i_3 = sycl::ext::intel::math::vimin3_s32(i, i_1, i_2); + i_3 = __vimin3_s32(i, i_1, i_2); + // CHECK: i_3 = sycl::ext::intel::math::vimin3_s32_relu(i, i_1, i_2); + i_3 = __vimin3_s32_relu(i, i_1, i_2); + // CHECK: u_3 = sycl::ext::intel::math::vimin3_u16x2(u, u_1, u_2); + u_3 = __vimin3_u16x2(u, u_1, u_2); + // CHECK: u_3 = sycl::ext::intel::math::vimin3_u32(u, u_1, u_2); + u_3 = __vimin3_u32(u, u_1, u_2); + // CHECK: u_2 = sycl::ext::intel::math::vimin_s16x2_relu(u, u_1); + u_2 = __vimin_s16x2_relu(u, u_1); + // CHECK: i_2 = sycl::ext::intel::math::vimin_s32_relu(i, i_1); + i_2 = __vimin_s32_relu(i, i_1); +} + +int main() { return 0; } diff --git a/clang/test/dpct/math/cuda-math-intrinsics-cuda12-after.cu b/clang/test/dpct/math/cuda-math-intrinsics-cuda12-after.cu index 57fbda42cd0e..754e89996fe2 100644 --- a/clang/test/dpct/math/cuda-math-intrinsics-cuda12-after.cu +++ b/clang/test/dpct/math/cuda-math-intrinsics-cuda12-after.cu @@ -9,35 +9,114 @@ using namespace std; __global__ void kernelFuncHalf(__half *deviceArrayHalf) { + unsigned u; __half h, h_1, h_2; __half2 h2, h2_1, h2_2; // Half2 Comparison Functions - // CHECK: h2_2 = dpct::compare_mask(h2, h2_1, std::equal_to<>()); - h2_2 = __heq2_mask(h2, h2_1); - // CHECK: h2_2 = dpct::unordered_compare_mask(h2, h2_1, std::equal_to<>()); - h2_2 = __hequ2_mask(h2, h2_1); - // CHECK: h2_2 = dpct::compare_mask(h2, h2_1, std::greater_equal<>()); - h2_2 = __hge2_mask(h2, h2_1); - // CHECK: h2_2 = dpct::unordered_compare_mask(h2, h2_1, std::greater_equal<>()); - h2_2 = __hgeu2_mask(h2, h2_1); - // CHECK: h2_2 = dpct::compare_mask(h2, h2_1, std::greater<>()); - h2_2 = __hgt2_mask(h2, h2_1); - // CHECK: h2_2 = dpct::unordered_compare_mask(h2, h2_1, std::greater<>()); - h2_2 = __hgtu2_mask(h2, h2_1); - // CHECK: h2_2 = dpct::compare_mask(h2, h2_1, std::less_equal<>()); - h2_2 = __hle2_mask(h2, h2_1); - // CHECK: h2_2 = dpct::unordered_compare_mask(h2, h2_1, std::less_equal<>()); - h2_2 = __hleu2_mask(h2, h2_1); - // CHECK: h2_2 = dpct::compare_mask(h2, h2_1, std::less<>()); - h2_2 = __hlt2_mask(h2, h2_1); - // CHECK: h2_2 = dpct::unordered_compare_mask(h2, h2_1, std::less<>()); - h2_2 = __hltu2_mask(h2, h2_1); - // CHECK: h2_2 = dpct::compare_mask(h2, h2_1, std::not_equal_to<>()); - h2_2 = __hne2_mask(h2, h2_1); - // CHECK: h2_2 = dpct::unordered_compare_mask(h2, h2_1, std::not_equal_to<>()); - h2_2 = __hneu2_mask(h2, h2_1); + // CHECK: u = dpct::compare_mask(h2, h2_1, std::equal_to<>()); + u = __heq2_mask(h2, h2_1); + // CHECK: u = dpct::unordered_compare_mask(h2, h2_1, std::equal_to<>()); + u = __hequ2_mask(h2, h2_1); + // CHECK: u = dpct::compare_mask(h2, h2_1, std::greater_equal<>()); + u = __hge2_mask(h2, h2_1); + // CHECK: u = dpct::unordered_compare_mask(h2, h2_1, std::greater_equal<>()); + u = __hgeu2_mask(h2, h2_1); + // CHECK: u = dpct::compare_mask(h2, h2_1, std::greater<>()); + u = __hgt2_mask(h2, h2_1); + // CHECK: u = dpct::unordered_compare_mask(h2, h2_1, std::greater<>()); + u = __hgtu2_mask(h2, h2_1); + // CHECK: u = dpct::compare_mask(h2, h2_1, std::less_equal<>()); + u = __hle2_mask(h2, h2_1); + // CHECK: u = dpct::unordered_compare_mask(h2, h2_1, std::less_equal<>()); + u = __hleu2_mask(h2, h2_1); + // CHECK: u = dpct::compare_mask(h2, h2_1, std::less<>()); + u = __hlt2_mask(h2, h2_1); + // CHECK: u = dpct::unordered_compare_mask(h2, h2_1, std::less<>()); + u = __hltu2_mask(h2, h2_1); + // CHECK: u = dpct::compare_mask(h2, h2_1, std::not_equal_to<>()); + u = __hne2_mask(h2, h2_1); + // CHECK: u = dpct::unordered_compare_mask(h2, h2_1, std::not_equal_to<>()); + u = __hneu2_mask(h2, h2_1); +} + +__global__ void kernelFuncSIMD() { + unsigned int u, u_1, u_2, u_3; + int i, i_1, i_2, i_3; + bool b_1, b_2; + // CHECK: u_3 = dpct::vectorized_ternary(u, u_1, u_2, std::plus<>(), dpct::maximum()); + u_3 = __viaddmax_s16x2(u, u_1, u_2); + // CHECK: u_3 = dpct::vectorized_ternary(u, u_1, u_2, std::plus<>(), dpct::maximum(), true); + u_3 = __viaddmax_s16x2_relu(u, u_1, u_2); + // CHECK: i_3 = sycl::max(i + i_1, i_2); + i_3 = __viaddmax_s32(i, i_1, i_2); + // CHECK: i_3 = dpct::relu(sycl::max(i + i_1, i_2)); + i_3 = __viaddmax_s32_relu(i, i_1, i_2); + // CHECK: u_3 = dpct::vectorized_ternary(u, u_1, u_2, std::plus<>(), dpct::maximum()); + u_3 = __viaddmax_u16x2(u, u_1, u_2); + // CHECK: u_3 = sycl::max(u + u_1, u_2); + u_3 = __viaddmax_u32(u, u_1, u_2); + // CHECK: u_3 = dpct::vectorized_ternary(u, u_1, u_2, std::plus<>(), dpct::minimum()); + u_3 = __viaddmin_s16x2(u, u_1, u_2); + // CHECK: u_3 = dpct::vectorized_ternary(u, u_1, u_2, std::plus<>(), dpct::minimum(), true); + u_3 = __viaddmin_s16x2_relu(u, u_1, u_2); + // CHECK: i_3 = sycl::min(i + i_1, i_2); + i_3 = __viaddmin_s32(i, i_1, i_2); + // CHECK: i_3 = dpct::relu(sycl::min(i + i_1, i_2)); + i_3 = __viaddmin_s32_relu(i, i_1, i_2); + // CHECK: u_3 = dpct::vectorized_ternary(u, u_1, u_2, std::plus<>(), dpct::minimum()); + u_3 = __viaddmin_u16x2(u, u_1, u_2); + // CHECK: u_3 = sycl::min(u + u_1, u_2); + u_3 = __viaddmin_u32(u, u_1, u_2); + // CHECK: u_2 = dpct::vectorized_binary_with_pred(u, u_1, dpct::maximum(), &b_1, &b_2); + u_2 = __vibmax_s16x2(u, u_1, &b_1, &b_2); + // CHECK: i_2 = dpct::maximum()(i, i_1, &b_1); + i_2 = __vibmax_s32(i, i_1, &b_1); + // CHECK: u_2 = dpct::vectorized_binary_with_pred(u, u_1, dpct::maximum(), &b_1, &b_2); + u_2 = __vibmax_u16x2(u, u_1, &b_1, &b_2); + // CHECK: u_2 = dpct::maximum()(u, u_1, &b_1); + u_2 = __vibmax_u32(u, u_1, &b_1); + // CHECK: u_2 = dpct::vectorized_binary_with_pred(u, u_1, dpct::minimum(), &b_1, &b_2); + u_2 = __vibmin_s16x2(u, u_1, &b_1, &b_2); + // CHECK: i_2 = dpct::minimum()(i, i_1, &b_1); + i_2 = __vibmin_s32(i, i_1, &b_1); + // CHECK: u_2 = dpct::vectorized_binary_with_pred(u, u_1, dpct::minimum(), &b_1, &b_2); + u_2 = __vibmin_u16x2(u, u_1, &b_1, &b_2); + // CHECK: u_2 = dpct::minimum()(u, u_1, &b_1); + u_2 = __vibmin_u32(u, u_1, &b_1); + // CHECK: u_3 = dpct::vectorized_ternary(u, u_1, u_2, dpct::maximum(), dpct::maximum()); + u_3 = __vimax3_s16x2(u, u_1, u_2); + // CHECK: u_3 = dpct::vectorized_ternary(u, u_1, u_2, dpct::maximum(), dpct::maximum(), true); + u_3 = __vimax3_s16x2_relu(u, u_1, u_2); + // CHECK: i_3 = sycl::max(sycl::max(i, i_1), i_2); + i_3 = __vimax3_s32(i, i_1, i_2); + // CHECK: i_3 = dpct::relu(sycl::max(sycl::max(i, i_1), i_2)); + i_3 = __vimax3_s32_relu(i, i_1, i_2); + // CHECK: u_3 = dpct::vectorized_ternary(u, u_1, u_2, dpct::maximum(), dpct::maximum()); + u_3 = __vimax3_u16x2(u, u_1, u_2); + // CHECK: u_3 = sycl::max(sycl::max(u, u_1), u_2); + u_3 = __vimax3_u32(u, u_1, u_2); + // CHECK: u_2 = dpct::vectorized_binary(u, u_1, dpct::maximum(), true); + u_2 = __vimax_s16x2_relu(u, u_1); + // CHECK: i_2 = dpct::relu(sycl::max(i, i_1)); + i_2 = __vimax_s32_relu(i, i_1); + // CHECK: u_3 = dpct::vectorized_ternary(u, u_1, u_2, dpct::minimum(), dpct::minimum()); + u_3 = __vimin3_s16x2(u, u_1, u_2); + // CHECK: u_3 = dpct::vectorized_ternary(u, u_1, u_2, dpct::minimum(), dpct::minimum(), true); + u_3 = __vimin3_s16x2_relu(u, u_1, u_2); + // CHECK: i_3 = sycl::min(sycl::min(i, i_1), i_2); + i_3 = __vimin3_s32(i, i_1, i_2); + // CHECK: i_3 = dpct::relu(sycl::min(sycl::min(i, i_1), i_2)); + i_3 = __vimin3_s32_relu(i, i_1, i_2); + // CHECK: u_3 = dpct::vectorized_ternary(u, u_1, u_2, dpct::minimum(), dpct::minimum()); + u_3 = __vimin3_u16x2(u, u_1, u_2); + // CHECK: u_3 = sycl::min(sycl::min(u, u_1), u_2); + u_3 = __vimin3_u32(u, u_1, u_2); + // CHECK: u_2 = dpct::vectorized_binary(u, u_1, dpct::minimum(), true); + u_2 = __vimin_s16x2_relu(u, u_1); + // CHECK: i_2 = dpct::relu(sycl::min(i, i_1)); + i_2 = __vimin_s32_relu(i, i_1); } int main() { return 0; } diff --git a/clang/test/dpct/math/cuda-math-syclcompat-cuda12-after.cu b/clang/test/dpct/math/cuda-math-syclcompat-cuda12-after.cu new file mode 100644 index 000000000000..4e9246a332ae --- /dev/null +++ b/clang/test/dpct/math/cuda-math-syclcompat-cuda12-after.cu @@ -0,0 +1,107 @@ +// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2, cuda-11.0, cuda-11.1, cuda-11.2, cuda-11.3, cuda-11.4, cuda-11.5, cuda-11.6, cuda-11.7, cuda-11.8 +// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2, v11.0, v11.1, v11.2, v11.3, v11.4, v11.5, v11.6, v11.7, v11.8 +// RUN: dpct --format-range=none -use-syclcompat -out-root %T/math/cuda-math-syclcompat-cuda12-after %s --cuda-include-path="%cuda-path/include" -- -x cuda --cuda-host-only --std=c++14 +// RUN: FileCheck --input-file %T/math/cuda-math-syclcompat-cuda12-after/cuda-math-syclcompat-cuda12-after.dp.cpp --match-full-lines %s +// RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/math/cuda-math-syclcompat-cuda12-after/cuda-math-syclcompat-cuda12-after.dp.cpp -o %T/math/cuda-math-syclcompat-cuda12-after/cuda-math-syclcompat-cuda12-after.dp.o %} + +#include "cuda_fp16.h" + +using namespace std; + +__global__ void kernelFuncSIMD() { + unsigned int u, u_1, u_2, u_3; + int i, i_1, i_2, i_3; + bool b_1, b_2; +#ifndef NO_BUILD_TEST + // CHECK: u_3 = syclcompat::vectorized_ternary(u, u_1, u_2, std::plus<>(), syclcompat::maximum()); + u_3 = __viaddmax_s16x2(u, u_1, u_2); + // CHECK: u_3 = syclcompat::vectorized_ternary(u, u_1, u_2, std::plus<>(), syclcompat::maximum(), true); + u_3 = __viaddmax_s16x2_relu(u, u_1, u_2); +#endif + // CHECK: i_3 = sycl::max(i + i_1, i_2); + i_3 = __viaddmax_s32(i, i_1, i_2); +#ifndef NO_BUILD_TEST + // CHECK: i_3 = syclcompat::relu(sycl::max(i + i_1, i_2)); + i_3 = __viaddmax_s32_relu(i, i_1, i_2); + // CHECK: u_3 = syclcompat::vectorized_ternary(u, u_1, u_2, std::plus<>(), syclcompat::maximum()); + u_3 = __viaddmax_u16x2(u, u_1, u_2); +#endif + // CHECK: u_3 = sycl::max(u + u_1, u_2); + u_3 = __viaddmax_u32(u, u_1, u_2); +#ifndef NO_BUILD_TEST + // CHECK: u_3 = syclcompat::vectorized_ternary(u, u_1, u_2, std::plus<>(), syclcompat::minimum()); + u_3 = __viaddmin_s16x2(u, u_1, u_2); + // CHECK: u_3 = syclcompat::vectorized_ternary(u, u_1, u_2, std::plus<>(), syclcompat::minimum(), true); + u_3 = __viaddmin_s16x2_relu(u, u_1, u_2); +#endif + // CHECK: i_3 = sycl::min(i + i_1, i_2); + i_3 = __viaddmin_s32(i, i_1, i_2); +#ifndef NO_BUILD_TEST + // CHECK: i_3 = syclcompat::relu(sycl::min(i + i_1, i_2)); + i_3 = __viaddmin_s32_relu(i, i_1, i_2); + // CHECK: u_3 = syclcompat::vectorized_ternary(u, u_1, u_2, std::plus<>(), syclcompat::minimum()); + u_3 = __viaddmin_u16x2(u, u_1, u_2); +#endif + // CHECK: u_3 = sycl::min(u + u_1, u_2); + u_3 = __viaddmin_u32(u, u_1, u_2); +#ifndef NO_BUILD_TEST + // CHECK: u_2 = syclcompat::vectorized_binary_with_pred(u, u_1, syclcompat::maximum(), &b_1, &b_2); + u_2 = __vibmax_s16x2(u, u_1, &b_1, &b_2); + // CHECK: i_2 = syclcompat::maximum()(i, i_1, &b_1); + i_2 = __vibmax_s32(i, i_1, &b_1); + // CHECK: u_2 = syclcompat::vectorized_binary_with_pred(u, u_1, syclcompat::maximum(), &b_1, &b_2); + u_2 = __vibmax_u16x2(u, u_1, &b_1, &b_2); + // CHECK: u_2 = syclcompat::maximum()(u, u_1, &b_1); + u_2 = __vibmax_u32(u, u_1, &b_1); + // CHECK: u_2 = syclcompat::vectorized_binary_with_pred(u, u_1, syclcompat::minimum(), &b_1, &b_2); + u_2 = __vibmin_s16x2(u, u_1, &b_1, &b_2); + // CHECK: i_2 = syclcompat::minimum()(i, i_1, &b_1); + i_2 = __vibmin_s32(i, i_1, &b_1); + // CHECK: u_2 = syclcompat::vectorized_binary_with_pred(u, u_1, syclcompat::minimum(), &b_1, &b_2); + u_2 = __vibmin_u16x2(u, u_1, &b_1, &b_2); + // CHECK: u_2 = syclcompat::minimum()(u, u_1, &b_1); + u_2 = __vibmin_u32(u, u_1, &b_1); + // CHECK: u_3 = syclcompat::vectorized_ternary(u, u_1, u_2, syclcompat::maximum(), syclcompat::maximum()); + u_3 = __vimax3_s16x2(u, u_1, u_2); + // CHECK: u_3 = syclcompat::vectorized_ternary(u, u_1, u_2, syclcompat::maximum(), syclcompat::maximum(), true); + u_3 = __vimax3_s16x2_relu(u, u_1, u_2); +#endif + // CHECK: i_3 = sycl::max(sycl::max(i, i_1), i_2); + i_3 = __vimax3_s32(i, i_1, i_2); +#ifndef NO_BUILD_TEST + // CHECK: i_3 = syclcompat::relu(sycl::max(sycl::max(i, i_1), i_2)); + i_3 = __vimax3_s32_relu(i, i_1, i_2); + // CHECK: u_3 = syclcompat::vectorized_ternary(u, u_1, u_2, syclcompat::maximum(), syclcompat::maximum()); + u_3 = __vimax3_u16x2(u, u_1, u_2); +#endif + // CHECK: u_3 = sycl::max(sycl::max(u, u_1), u_2); + u_3 = __vimax3_u32(u, u_1, u_2); +#ifndef NO_BUILD_TEST + // CHECK: u_2 = syclcompat::vectorized_binary(u, u_1, syclcompat::maximum(), true); + u_2 = __vimax_s16x2_relu(u, u_1); + // CHECK: i_2 = syclcompat::relu(sycl::max(i, i_1)); + i_2 = __vimax_s32_relu(i, i_1); + // CHECK: u_3 = syclcompat::vectorized_ternary(u, u_1, u_2, syclcompat::minimum(), syclcompat::minimum()); + u_3 = __vimin3_s16x2(u, u_1, u_2); + // CHECK: u_3 = syclcompat::vectorized_ternary(u, u_1, u_2, syclcompat::minimum(), syclcompat::minimum(), true); + u_3 = __vimin3_s16x2_relu(u, u_1, u_2); +#endif + // CHECK: i_3 = sycl::min(sycl::min(i, i_1), i_2); + i_3 = __vimin3_s32(i, i_1, i_2); +#ifndef NO_BUILD_TEST + // CHECK: i_3 = syclcompat::relu(sycl::min(sycl::min(i, i_1), i_2)); + i_3 = __vimin3_s32_relu(i, i_1, i_2); + // CHECK: u_3 = syclcompat::vectorized_ternary(u, u_1, u_2, syclcompat::minimum(), syclcompat::minimum()); + u_3 = __vimin3_u16x2(u, u_1, u_2); +#endif + // CHECK: u_3 = sycl::min(sycl::min(u, u_1), u_2); + u_3 = __vimin3_u32(u, u_1, u_2); +#ifndef NO_BUILD_TEST + // CHECK: u_2 = syclcompat::vectorized_binary(u, u_1, syclcompat::minimum(), true); + u_2 = __vimin_s16x2_relu(u, u_1); + // CHECK: i_2 = syclcompat::relu(sycl::min(i, i_1)); + i_2 = __vimin_s32_relu(i, i_1); +#endif +} + +int main() { return 0; }