From 398398becff28608e1ee4c558cc342c12bd37f69 Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi Date: Fri, 19 Sep 2025 16:08:56 +0530 Subject: [PATCH 1/8] [clang][NVPTX] Add intrinsics and builtins for cvt RS rounding mode This change adds LLVM intrinsics and clang builtins for the `cvt` RS rounding mode instruction variants. Tests are added in `convert-sm103a.ll` and verified through ptxas-13.0. --- clang/include/clang/Basic/BuiltinsNVPTX.td | 21 ++ clang/test/CodeGen/builtins-nvptx.c | 83 +++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 33 ++ .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 3 + llvm/lib/Target/NVPTX/NVPTX.h | 1 + llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 44 +++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 60 ++++ llvm/test/CodeGen/NVPTX/convert-sm103a.ll | 327 ++++++++++++++++++ 8 files changed, 572 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm103a.ll diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td index 2d6fa1771014d..0f59fc93cede1 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.td +++ b/clang/include/clang/Basic/BuiltinsNVPTX.td @@ -579,11 +579,19 @@ def __nvvm_ff2bf16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float) def __nvvm_ff2bf16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>; def __nvvm_ff2bf16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>; def __nvvm_ff2bf16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>; +def __nvvm_ff2bf16x2_rs : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2bf16x2_rs_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2bf16x2_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2bf16x2_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; def __nvvm_ff2f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; def __nvvm_ff2f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; def __nvvm_ff2f16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; def __nvvm_ff2f16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; +def __nvvm_ff2f16x2_rs : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2f16x2_rs_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2f16x2_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2f16x2_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; def __nvvm_f2bf16_rn : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2bf16_rn_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; @@ -616,6 +624,11 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>; def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>; +def __nvvm_ff_to_e4m3x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff_to_e4m3x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff_to_e5m2x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff_to_e5m2x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; + def __nvvm_ff_to_e2m3x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_ff_to_e2m3x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_ff_to_e3m2x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; @@ -626,12 +639,20 @@ def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_e2m3x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff_to_e2m3x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff_to_e3m2x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff_to_e3m2x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; + def __nvvm_ff_to_e2m1x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_ff_to_e2m1x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_e2m1x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_e2m1x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_e2m1x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff_to_e2m1x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; + def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index f994adb14e457..70facdbbabb0e 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -43,6 +43,12 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_103a -target-feature +ptx87 -DPTX=87 \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX87_SM103a %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 -DPTX=87 \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX87_SM100a %s // ### The last run to check with the highest SM and PTX version available // ### to make sure target builtins are still accepted. // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \ @@ -1203,6 +1209,83 @@ __device__ void nvvm_cvt_sm100a_sm101a_sm120a() { // CHECK: ret void } +__device__ void nvvm_cvt_sm100a_sm103a() { +#if (PTX >= 87) && (__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM103_ALL) + +// CHECK_PTX87_SM100a: call <2 x half> @llvm.nvvm.ff2f16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <2 x half> @llvm.nvvm.ff2f16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff2f16x2_rs(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff2f16x2_rs_relu(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff2f16x2_rs_satfinite(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff2f16x2_rs_relu_satfinite(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff2bf16x2_rs(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff2bf16x2_rs_relu(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff2bf16x2_rs_satfinite(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff2bf16x2_rs_relu_satfinite(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff_to_e4m3x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff_to_e4m3x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff_to_e5m2x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff_to_e5m2x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff_to_e2m3x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff_to_e2m3x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff_to_e3m2x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff_to_e3m2x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call i16 @llvm.nvvm.ff.to.e2m1x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call i16 @llvm.nvvm.ff.to.e2m1x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff_to_e2m1x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: call i16 @llvm.nvvm.ff.to.e2m1x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: call i16 @llvm.nvvm.ff.to.e2m1x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) + __nvvm_ff_to_e2m1x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); +#endif +} + #define NAN32 0x7FBFFFFF #define NAN16 (__bf16)0x7FBF #define BF16 (__bf16)0.1f diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 9cfab26fffa54..abc8c9cb8b94e 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1493,6 +1493,17 @@ let TargetPrefix = "nvvm" in { } } + // RS rounding mode conversions for f16x2, bf16x2 types + foreach relu = ["", "_relu"] in { + foreach satfinite = ["", "_satfinite"] in { + def int_nvvm_ff2f16x2_rs # relu # satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty]>; + + def int_nvvm_ff2bf16x2_rs # relu # satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty]>; + } + } + foreach satfinite = ["", "_satfinite"] in { def int_nvvm_f2tf32_rna # satfinite : NVVMBuiltin, PureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; @@ -1515,6 +1526,14 @@ let TargetPrefix = "nvvm" in { PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; } } + + // RS rounding mode conversions for f8x4 types + foreach type = ["e4m3x4", "e5m2x4"] in { + foreach relu = ["", "_relu"] in { + def int_nvvm_ff_to_ # type # _rs # relu # _satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_v4i8_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i32_ty]>; + } + } // FP4 conversions. foreach relu = ["", "_relu"] in { @@ -1524,6 +1543,12 @@ let TargetPrefix = "nvvm" in { def int_nvvm_e2m1x2_to_f16x2_rn # relu : NVVMBuiltin, PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; } + + // RS rounding mode conversions for f4x4 type + foreach relu = ["", "_relu"] in { + def int_nvvm_ff_to_e2m1x4_rs # relu # _satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i32_ty]>; + } // FP6 conversions. foreach type = ["e2m3x2", "e3m2x2"] in { @@ -1535,6 +1560,14 @@ let TargetPrefix = "nvvm" in { PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; } } + + // RS rounding mode conversions for f6x4 types + foreach type = ["e2m3x4", "e3m2x4"] in { + foreach relu = ["", "_relu"] in { + def int_nvvm_ff_to_ # type # _rs # relu # _satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_v4i8_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i32_ty]>; + } + } // UE8M0x2 conversions. foreach rmode = ["_rz", "_rp"] in { diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index f9bdc09935330..77913f27838e2 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -149,6 +149,9 @@ void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O, case NVPTX::PTXCvtMode::RNA: O << ".rna"; return; + case NVPTX::PTXCvtMode::RS: + O << ".rs"; + return; } } llvm_unreachable("Invalid conversion modifier"); diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 77a0e03d4075a..1e0f747f8f7fc 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -207,6 +207,7 @@ enum CvtMode { RM, RP, RNA, + RS, BASE_MASK = 0x0F, FTZ_FLAG = 0x10, diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 191213f9ad2ee..637410f586a9e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -35,6 +35,7 @@ def CvtRZ : PatLeaf<(i32 0x6)>; def CvtRM : PatLeaf<(i32 0x7)>; def CvtRP : PatLeaf<(i32 0x8)>; def CvtRNA : PatLeaf<(i32 0x9)>; +def CvtRS : PatLeaf<(i32 0xA)>; def CvtNONE_FTZ : PatLeaf<(i32 0x10)>; def CvtRNI_FTZ : PatLeaf<(i32 0x11)>; @@ -52,6 +53,7 @@ def CvtSAT_FTZ : PatLeaf<(i32 0x30)>; def CvtNONE_RELU : PatLeaf<(i32 0x40)>; def CvtRN_RELU : PatLeaf<(i32 0x45)>; def CvtRZ_RELU : PatLeaf<(i32 0x46)>; +def CvtRS_RELU : PatLeaf<(i32 0x4A)>; def CvtMode : Operand { let PrintMethod = "printCvtMode"; @@ -132,6 +134,9 @@ def hasSM100a : Predicate<"Subtarget->getSmVersion() == 100 && Subtarget->hasArc def hasSM101a : Predicate<"Subtarget->getSmVersion() == 101 && Subtarget->hasArchAccelFeatures()">; def hasSM120a : Predicate<"Subtarget->getSmVersion() == 120 && Subtarget->hasArchAccelFeatures()">; +def hasSM100aOrSM103a : + Predicate<"(Subtarget->getSmVersion() == 100 || Subtarget->getSmVersion() == 103) && Subtarget->hasArchAccelFeatures()">; + // non-sync shfl instructions are not available on sm_70+ in PTX6.4+ def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" "&& Subtarget->getPTXVersion() >= 64)">; @@ -592,6 +597,21 @@ let hasSideEffects = false in { defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", B32>; defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", B32>; + + multiclass CVT_FROM_FLOAT_V2_RS { + def _f32_rs : + BasicFlagsNVPTXInst<(outs RC:$dst), + (ins B32:$src1, B32:$src2, B32:$src3), (ins CvtMode:$mode), + "cvt${mode:base}${mode:relu}." # FromName # ".f32">; + + def _f32_rs_sf : + BasicFlagsNVPTXInst<(outs RC:$dst), + (ins B32:$src1, B32:$src2, B32:$src3), (ins CvtMode:$mode), + "cvt${mode:base}${mode:relu}.satfinite." # FromName # ".f32">; + } + + defm CVT_f16x2 : CVT_FROM_FLOAT_V2_RS<"f16x2", B32>; + defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_RS<"bf16x2", B32>; // FP8 conversions. multiclass CVT_TO_F8X2 { @@ -618,6 +638,15 @@ let hasSideEffects = false in { def CVT_f16x2_e4m3x2 : CVT_f16x2_fp8<"e4m3">; def CVT_f16x2_e5m2x2 : CVT_f16x2_fp8<"e5m2">; + + class CVT_TO_FP8X4 : + NVPTXInst<(outs B32:$dst), + (ins B32:$src1, B32:$src2, B32:$src3, B32:$src4, B32:$src5, + CvtMode:$mode), + "cvt${mode:base}${mode:relu}.satfinite." # F8Name # "x4.f32 \t$dst, {{$src1, $src2, $src3, $src4}}, $src5;">; + + def CVT_e4m3x4_f32_rs_sf : CVT_TO_FP8X4<"e4m3">; + def CVT_e5m2x4_f32_rs_sf : CVT_TO_FP8X4<"e5m2">; // Float to TF32 conversions multiclass CVT_TO_TF32 Preds = [hasPTX<78>, hasSM<90>]> { @@ -651,6 +680,15 @@ let hasSideEffects = false in { "cvt${mode:base}${mode:relu}.f16x2." # type>; } + class CVT_TO_FP6X4 : + NVPTXInst<(outs B32:$dst), + (ins B32:$src1, B32:$src2, B32:$src3, B32:$src4, B32:$src5, + CvtMode:$mode), + "cvt${mode:base}${mode:relu}.satfinite." # F6Name # "x4.f32 \t$dst, {{$src1, $src2, $src3, $src4}}, $src5;">; + + def CVT_e2m3x4_f32_rs_sf : CVT_TO_FP6X4<"e2m3">; + def CVT_e3m2x4_f32_rs_sf : CVT_TO_FP6X4<"e3m2">; + // FP4 conversions. def CVT_e2m1x2_f32_sf : NVPTXInst<(outs B16:$dst), (ins B32:$src1, B32:$src2, CvtMode:$mode), @@ -667,6 +705,12 @@ let hasSideEffects = false in { "cvt.u8.u16 \t%e2m1x2_in, $src; \n\t", "cvt${mode:base}${mode:relu}.f16x2.e2m1x2 \t$dst, %e2m1x2_in; \n\t", "}}"), []>; + + def CVT_e2m1x4_f32_rs_sf : + NVPTXInst<(outs B16:$dst), + (ins B32:$src1, B32:$src2, B32:$src3, B32:$src4, B32:$src5, + CvtMode:$mode), + "cvt${mode:base}${mode:relu}.satfinite.e2m1x4.f32 \t$dst, {{$src1, $src2, $src3, $src4}}, $src5;">; // UE8M0x2 conversions. class CVT_f32_to_ue8m0x2 : diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index e91171c1ae38f..16debba5002f8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1782,11 +1782,32 @@ def : Pat<(int_nvvm_ff2bf16x2_rn_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, C def : Pat<(int_nvvm_ff2bf16x2_rz f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ)>; def : Pat<(int_nvvm_ff2bf16x2_rz_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ_RELU)>; +let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in { +def : Pat<(int_nvvm_ff2bf16x2_rs f32:$a, f32:$b, i32:$c), + (CVT_bf16x2_f32_rs $a, $b, $c, CvtRS)>; +def : Pat<(int_nvvm_ff2bf16x2_rs_relu f32:$a, f32:$b, i32:$c), + (CVT_bf16x2_f32_rs $a, $b, $c, CvtRS_RELU)>; +def : Pat<(int_nvvm_ff2bf16x2_rs_satfinite f32:$a, f32:$b, i32:$c), + (CVT_bf16x2_f32_rs_sf $a, $b, $c, CvtRS)>; +def : Pat<(int_nvvm_ff2bf16x2_rs_relu_satfinite f32:$a, f32:$b, i32:$c), + (CVT_bf16x2_f32_rs_sf $a, $b, $c, CvtRS_RELU)>; +} + def : Pat<(int_nvvm_ff2f16x2_rn f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRN)>; def : Pat<(int_nvvm_ff2f16x2_rn_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRN_RELU)>; def : Pat<(int_nvvm_ff2f16x2_rz f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ)>; def : Pat<(int_nvvm_ff2f16x2_rz_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ_RELU)>; +let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in { +def : Pat<(int_nvvm_ff2f16x2_rs f32:$a, f32:$b, i32:$c), + (CVT_f16x2_f32_rs $a, $b, $c, CvtRS)>; +def : Pat<(int_nvvm_ff2f16x2_rs_relu f32:$a, f32:$b, i32:$c), + (CVT_f16x2_f32_rs $a, $b, $c, CvtRS_RELU)>; +def : Pat<(int_nvvm_ff2f16x2_rs_satfinite f32:$a, f32:$b, i32:$c), + (CVT_f16x2_f32_rs_sf $a, $b, $c, CvtRS)>; +def : Pat<(int_nvvm_ff2f16x2_rs_relu_satfinite f32:$a, f32:$b, i32:$c), + (CVT_f16x2_f32_rs_sf $a, $b, $c, CvtRS_RELU)>; +} def : Pat<(int_nvvm_f2bf16_rn f32:$a), (CVT_bf16_f32 $a, CvtRN)>; def : Pat<(int_nvvm_f2bf16_rn_relu f32:$a), (CVT_bf16_f32 $a, CvtRN_RELU)>; def : Pat<(int_nvvm_f2bf16_rz f32:$a), (CVT_bf16_f32 $a, CvtRZ)>; @@ -1929,6 +1950,45 @@ let Predicates = [hasPTX<86>, hasSM<100>, hasArchAccelFeatures] in { (CVT_bf16x2_ue8m0x2 $a)>; } +// RS rounding mode conversions +let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in { +// FP8x4 conversions +def : Pat<(int_nvvm_ff_to_e4m3x4_rs_satfinite + f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), + (CVT_e4m3x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS)>; +def : Pat<(int_nvvm_ff_to_e4m3x4_rs_relu_satfinite + f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), + (CVT_e4m3x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS_RELU)>; +def : Pat<(int_nvvm_ff_to_e5m2x4_rs_satfinite + f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), + (CVT_e5m2x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS)>; +def : Pat<(int_nvvm_ff_to_e5m2x4_rs_relu_satfinite + f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), + (CVT_e5m2x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS_RELU)>; + +// FP6x4 conversions +def : Pat<(int_nvvm_ff_to_e2m3x4_rs_satfinite + f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), + (CVT_e2m3x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS)>; +def : Pat<(int_nvvm_ff_to_e2m3x4_rs_relu_satfinite + f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), + (CVT_e2m3x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS_RELU)>; +def : Pat<(int_nvvm_ff_to_e3m2x4_rs_satfinite + f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), + (CVT_e3m2x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS)>; +def : Pat<(int_nvvm_ff_to_e3m2x4_rs_relu_satfinite + f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), + (CVT_e3m2x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS_RELU)>; + +// FP4x4 conversions +def : Pat<(int_nvvm_ff_to_e2m1x4_rs_satfinite + f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), + (CVT_e2m1x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS)>; +def : Pat<(int_nvvm_ff_to_e2m1x4_rs_relu_satfinite + f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), + (CVT_e2m1x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS_RELU)>; +} + // // FNS // diff --git a/llvm/test/CodeGen/NVPTX/convert-sm103a.ll b/llvm/test/CodeGen/NVPTX/convert-sm103a.ll new file mode 100644 index 0000000000000..39a9ed18da71c --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/convert-sm103a.ll @@ -0,0 +1,327 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx87 | FileCheck %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx87 | FileCheck %s +; RUN: %if ptxas-sm_100a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx87 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_103a && ptxas-isa-8.7 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx87 | %ptxas-verify -arch=sm_103a %} + +; F16X2 conversions + +define <2 x half> @cvt_rs_f16x2_f32(float %f1, float %f2, i32 %rbits) { +; CHECK-LABEL: cvt_rs_f16x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_f16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_f16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_f16x2_f32_param_2]; +; CHECK-NEXT: cvt.rs.f16x2.f32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rs(float %f1, float %f2, i32 %rbits) + ret <2 x half> %val +} + +define <2 x half> @cvt_rs_relu_f16x2_f32(float %f1, float %f2, i32 %rbits) { +; CHECK-LABEL: cvt_rs_relu_f16x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_f16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_f16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_f16x2_f32_param_2]; +; CHECK-NEXT: cvt.rs.relu.f16x2.f32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float %f1, float %f2, i32 %rbits) + ret <2 x half> %val +} + +define <2 x half> @cvt_rs_sf_f16x2_f32(float %f1, float %f2, i32 %rbits) { +; CHECK-LABEL: cvt_rs_sf_f16x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_f16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_f16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_f16x2_f32_param_2]; +; CHECK-NEXT: cvt.rs.satfinite.f16x2.f32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float %f1, float %f2, i32 %rbits) + ret <2 x half> %val +} + +define <2 x half> @cvt_rs_relu_sf_f16x2_f32(float %f1, float %f2, i32 %rbits) { +; CHECK-LABEL: cvt_rs_relu_sf_f16x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_f16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_f16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_f16x2_f32_param_2]; +; CHECK-NEXT: cvt.rs.relu.satfinite.f16x2.f32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float %f1, float %f2, i32 %rbits) + ret <2 x half> %val +} + +; BF16X2 conversions + +define <2 x bfloat> @cvt_rs_bf16x2_f32(float %f1, float %f2, i32 %rbits) { +; CHECK-LABEL: cvt_rs_bf16x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_bf16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_bf16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_bf16x2_f32_param_2]; +; CHECK-NEXT: cvt.rs.bf16x2.f32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float %f1, float %f2, i32 %rbits) + ret <2 x bfloat> %val +} + +define <2 x bfloat> @cvt_rs_relu_bf16x2_f32(float %f1, float %f2, i32 %rbits) { +; CHECK-LABEL: cvt_rs_relu_bf16x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_bf16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_bf16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_bf16x2_f32_param_2]; +; CHECK-NEXT: cvt.rs.relu.bf16x2.f32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float %f1, float %f2, i32 %rbits) + ret <2 x bfloat> %val +} + +define <2 x bfloat> @cvt_rs_sf_bf16x2_f32(float %f1, float %f2, i32 %rbits) { +; CHECK-LABEL: cvt_rs_sf_bf16x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_bf16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_bf16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_bf16x2_f32_param_2]; +; CHECK-NEXT: cvt.rs.satfinite.bf16x2.f32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float %f1, float %f2, i32 %rbits) + ret <2 x bfloat> %val +} + +define <2 x bfloat> @cvt_rs_relu_sf_bf16x2_f32(float %f1, float %f2, i32 %rbits) { +; CHECK-LABEL: cvt_rs_relu_sf_bf16x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_bf16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_bf16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_bf16x2_f32_param_2]; +; CHECK-NEXT: cvt.rs.relu.satfinite.bf16x2.f32 %r4, %r1, %r2, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float %f1, float %f2, i32 %rbits) + ret <2 x bfloat> %val +} + +; F8X4 conversions + +define <4 x i8> @cvt_rs_sf_e4m3x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +; CHECK-LABEL: cvt_rs_sf_e4m3x4_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_e4m3x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_e4m3x4_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_e4m3x4_f32_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_sf_e4m3x4_f32_param_3]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e4m3x4_f32_param_4]; +; CHECK-NEXT: cvt.rs.satfinite.e4m3x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %val = call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + ret <4 x i8> %val +} + +define <4 x i8> @cvt_rs_relu_sf_e4m3x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +; CHECK-LABEL: cvt_rs_relu_sf_e4m3x4_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_e4m3x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_e4m3x4_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_e4m3x4_f32_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_relu_sf_e4m3x4_f32_param_3]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e4m3x4_f32_param_4]; +; CHECK-NEXT: cvt.rs.relu.satfinite.e4m3x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %val = call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.relu.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + ret <4 x i8> %val +} + +define <4 x i8> @cvt_rs_sf_e5m2x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +; CHECK-LABEL: cvt_rs_sf_e5m2x4_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_e5m2x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_e5m2x4_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_e5m2x4_f32_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_sf_e5m2x4_f32_param_3]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e5m2x4_f32_param_4]; +; CHECK-NEXT: cvt.rs.satfinite.e5m2x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %val = call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + ret <4 x i8> %val +} + +define <4 x i8> @cvt_rs_relu_sf_e5m2x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +; CHECK-LABEL: cvt_rs_relu_sf_e5m2x4_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_e5m2x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_e5m2x4_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_e5m2x4_f32_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_relu_sf_e5m2x4_f32_param_3]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e5m2x4_f32_param_4]; +; CHECK-NEXT: cvt.rs.relu.satfinite.e5m2x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %val = call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.relu.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + ret <4 x i8> %val +} + +; F6X4 conversions + +define <4 x i8> @cvt_rs_sf_e2m3x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +; CHECK-LABEL: cvt_rs_sf_e2m3x4_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_e2m3x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_e2m3x4_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_e2m3x4_f32_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_sf_e2m3x4_f32_param_3]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e2m3x4_f32_param_4]; +; CHECK-NEXT: cvt.rs.satfinite.e2m3x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %val = call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + ret <4 x i8> %val +} + +define <4 x i8> @cvt_rs_relu_sf_e2m3x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +; CHECK-LABEL: cvt_rs_relu_sf_e2m3x4_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_e2m3x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_e2m3x4_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_e2m3x4_f32_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_relu_sf_e2m3x4_f32_param_3]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e2m3x4_f32_param_4]; +; CHECK-NEXT: cvt.rs.relu.satfinite.e2m3x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %val = call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.relu.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + ret <4 x i8> %val +} + +define <4 x i8> @cvt_rs_sf_e3m2x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +; CHECK-LABEL: cvt_rs_sf_e3m2x4_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_e3m2x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_e3m2x4_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_e3m2x4_f32_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_sf_e3m2x4_f32_param_3]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e3m2x4_f32_param_4]; +; CHECK-NEXT: cvt.rs.satfinite.e3m2x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %val = call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + ret <4 x i8> %val +} + +define <4 x i8> @cvt_rs_relu_sf_e3m2x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +; CHECK-LABEL: cvt_rs_relu_sf_e3m2x4_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_e3m2x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_e3m2x4_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_e3m2x4_f32_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_relu_sf_e3m2x4_f32_param_3]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e3m2x4_f32_param_4]; +; CHECK-NEXT: cvt.rs.relu.satfinite.e3m2x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %val = call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.relu.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + ret <4 x i8> %val +} + +; F4X4 conversions + +define i16 @cvt_rs_sf_e2m1x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +; CHECK-LABEL: cvt_rs_sf_e2m1x4_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_e2m1x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_e2m1x4_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_e2m1x4_f32_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_sf_e2m1x4_f32_param_3]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e2m1x4_f32_param_4]; +; CHECK-NEXT: cvt.rs.satfinite.e2m1x4.f32 %rs1, {%r1, %r2, %r3, %r4}, %r5; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.e2m1x4.rs.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + ret i16 %val +} + +define i16 @cvt_rs_relu_sf_e2m1x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +; CHECK-LABEL: cvt_rs_relu_sf_e2m1x4_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_e2m1x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_e2m1x4_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_e2m1x4_f32_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_relu_sf_e2m1x4_f32_param_3]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e2m1x4_f32_param_4]; +; CHECK-NEXT: cvt.rs.relu.satfinite.e2m1x4.f32 %rs1, {%r1, %r2, %r3, %r4}, %r5; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.e2m1x4.rs.relu.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + ret i16 %val +} From 2e8acb16e196b687a92b71928c76addd03744534 Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi Date: Mon, 22 Sep 2025 14:01:51 +0530 Subject: [PATCH 2/8] change signature to take in <4 x float> --- clang/include/clang/Basic/BuiltinsNVPTX.td | 20 ++-- clang/test/CodeGen/builtins-nvptx.c | 60 +++++------ llvm/include/llvm/IR/IntrinsicsNVVM.td | 12 +-- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 84 ++++++++++++++- llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 5 + llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 10 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 75 +++++++------ llvm/test/CodeGen/NVPTX/convert-sm103a.ll | 110 +++++++------------- 8 files changed, 223 insertions(+), 153 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td index 0f59fc93cede1..819262d87a917 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.td +++ b/clang/include/clang/Basic/BuiltinsNVPTX.td @@ -624,10 +624,10 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>; def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>; -def __nvvm_ff_to_e4m3x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_ff_to_e4m3x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_ff_to_e5m2x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_ff_to_e5m2x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e4m3x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e4m3x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e5m2x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e5m2x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; def __nvvm_ff_to_e2m3x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_ff_to_e2m3x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; @@ -639,10 +639,10 @@ def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; -def __nvvm_ff_to_e2m3x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_ff_to_e2m3x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_ff_to_e3m2x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_ff_to_e3m2x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e2m3x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e2m3x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e3m2x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e3m2x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; def __nvvm_ff_to_e2m1x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_ff_to_e2m1x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; @@ -650,8 +650,8 @@ def __nvvm_ff_to_e2m1x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, f def __nvvm_e2m1x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_e2m1x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; -def __nvvm_ff_to_e2m1x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_ff_to_e2m1x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float, float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e2m1x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"short(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e2m1x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 70facdbbabb0e..0cf116ea5c5b4 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -1244,45 +1244,45 @@ __device__ void nvvm_cvt_sm100a_sm103a() { // CHECK_PTX87_SM103a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) __nvvm_ff2bf16x2_rs_relu_satfinite(1.0f, 1.0f, 0); -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff_to_e4m3x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) + __nvvm_f32x4_to_e4m3x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff_to_e4m3x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) + __nvvm_f32x4_to_e4m3x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff_to_e5m2x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) + __nvvm_f32x4_to_e5m2x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff_to_e5m2x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) + __nvvm_f32x4_to_e5m2x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff_to_e2m3x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) + __nvvm_f32x4_to_e2m3x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff_to_e2m3x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) + __nvvm_f32x4_to_e2m3x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff_to_e3m2x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) + __nvvm_f32x4_to_e3m2x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff_to_e3m2x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) + __nvvm_f32x4_to_e3m2x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); -// CHECK_PTX87_SM100a: call i16 @llvm.nvvm.ff.to.e2m1x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call i16 @llvm.nvvm.ff.to.e2m1x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff_to_e2m1x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); +// CHECK_PTX87_SM100a: call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) + __nvvm_f32x4_to_e2m1x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); -// CHECK_PTX87_SM100a: call i16 @llvm.nvvm.ff.to.e2m1x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call i16 @llvm.nvvm.ff.to.e2m1x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff_to_e2m1x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); +// CHECK_PTX87_SM100a: call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) + __nvvm_f32x4_to_e2m1x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); #endif } diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index abc8c9cb8b94e..d78ef6e7d8b78 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1530,8 +1530,8 @@ let TargetPrefix = "nvvm" in { // RS rounding mode conversions for f8x4 types foreach type = ["e4m3x4", "e5m2x4"] in { foreach relu = ["", "_relu"] in { - def int_nvvm_ff_to_ # type # _rs # relu # _satfinite : NVVMBuiltin, - PureIntrinsic<[llvm_v4i8_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i32_ty]>; + def int_nvvm_f32x4_to_ # type # _rs # relu # _satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_v4i8_ty], [llvm_v4f32_ty, llvm_i32_ty]>; } } @@ -1546,8 +1546,8 @@ let TargetPrefix = "nvvm" in { // RS rounding mode conversions for f4x4 type foreach relu = ["", "_relu"] in { - def int_nvvm_ff_to_e2m1x4_rs # relu # _satfinite : NVVMBuiltin, - PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i32_ty]>; + def int_nvvm_f32x4_to_e2m1x4_rs # relu # _satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_v4f32_ty, llvm_i32_ty]>; } // FP6 conversions. @@ -1564,8 +1564,8 @@ let TargetPrefix = "nvvm" in { // RS rounding mode conversions for f6x4 types foreach type = ["e2m3x4", "e3m2x4"] in { foreach relu = ["", "_relu"] in { - def int_nvvm_ff_to_ # type # _rs # relu # _satfinite : NVVMBuiltin, - PureIntrinsic<[llvm_v4i8_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i32_ty]>; + def int_nvvm_f32x4_to_ # type # _rs # relu # _satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_v4i8_ty], [llvm_v4f32_ty, llvm_i32_ty]>; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 68935784128c0..07798f6fed6a3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1077,9 +1077,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Enable custom lowering for the following: // * MVT::i128 - clusterlaunchcontrol // * MVT::i32 - prmt + // * MVT::v4f32 - cvt_rs fp{4/6/8}x4 intrinsics // * MVT::Other - internal.addrspace.wrap - setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other}, - Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, + {MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom); } const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -1162,6 +1163,11 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT) MAKE_CASE( NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT) + MAKE_CASE(NVPTXISD::CVT_E4M3X4_F32X4_RS_SF) + MAKE_CASE(NVPTXISD::CVT_E5M2X4_F32X4_RS_SF) + MAKE_CASE(NVPTXISD::CVT_E2M3X4_F32X4_RS_SF) + MAKE_CASE(NVPTXISD::CVT_E3M2X4_F32X4_RS_SF) + MAKE_CASE(NVPTXISD::CVT_E2M1X4_F32X4_RS_SF) } return nullptr; @@ -2839,6 +2845,69 @@ static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, {TryCancelResponse0, TryCancelResponse1}); } +bool isCvtRSReluIntrinsic(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite: + case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite: + case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite: + case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite: + case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite: + return true; + default: + return false; + } +} + +static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG) { + SDNode *N = Op.getNode(); + SDLoc DL(N); + SDValue F32Vec = N->getOperand(1); + SDValue RBits = N->getOperand(2); + + unsigned IntrinsicID = N->getConstantOperandVal(0); + + uint32_t CvtModeFlag = NVPTX::PTXCvtMode::CvtMode::RS; + if (isCvtRSReluIntrinsic(IntrinsicID)) + CvtModeFlag |= NVPTX::PTXCvtMode::CvtMode::RELU_FLAG; + + SDValue Float1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec, + DAG.getIntPtrConstant(0, DL)); + SDValue Float2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec, + DAG.getIntPtrConstant(1, DL)); + SDValue Float3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec, + DAG.getIntPtrConstant(2, DL)); + SDValue Float4 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec, + DAG.getIntPtrConstant(3, DL)); + + auto OpSignature = + [&]() -> std::pair { + switch (IntrinsicID) { + case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite: + case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite: + return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8}; + case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite: + case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite: + return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8}; + case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite: + case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite: + return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8}; + case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite: + case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite: + return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8}; + case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite: + case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite: + return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16}; + default: + llvm_unreachable("unsupported/unhandled intrinsic"); + } + }(); + + SDValue Ops[] = {Float1, Float2, Float3, + Float4, RBits, DAG.getConstant(CvtModeFlag, DL, MVT::i32)}; + + return DAG.getNode(OpSignature.first, DL, OpSignature.second, Ops); +} + static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) { const unsigned Mode = [&]() { switch (Op->getConstantOperandVal(0)) { @@ -2886,6 +2955,17 @@ static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) { case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y: case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z: return LowerClusterLaunchControlQueryCancel(Op, DAG); + case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite: + case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite: + case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite: + case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite: + case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite: + case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite: + case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite: + case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite: + case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite: + case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite: + return lowerCvtRSIntrinsics(Op, DAG); } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 769d2fe46f2c8..63fa0bb9159ff 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -79,6 +79,11 @@ enum NodeType : unsigned { CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X, CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y, CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z, + CVT_E4M3X4_F32X4_RS_SF, + CVT_E5M2X4_F32X4_RS_SF, + CVT_E2M3X4_F32X4_RS_SF, + CVT_E3M2X4_F32X4_RS_SF, + CVT_E2M1X4_F32X4_RS_SF, FIRST_MEMORY_OPCODE, diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 637410f586a9e..e45c6f592eeac 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -645,8 +645,8 @@ let hasSideEffects = false in { CvtMode:$mode), "cvt${mode:base}${mode:relu}.satfinite." # F8Name # "x4.f32 \t$dst, {{$src1, $src2, $src3, $src4}}, $src5;">; - def CVT_e4m3x4_f32_rs_sf : CVT_TO_FP8X4<"e4m3">; - def CVT_e5m2x4_f32_rs_sf : CVT_TO_FP8X4<"e5m2">; + def CVT_e4m3x4_f32x4_rs_sf : CVT_TO_FP8X4<"e4m3">; + def CVT_e5m2x4_f32x4_rs_sf : CVT_TO_FP8X4<"e5m2">; // Float to TF32 conversions multiclass CVT_TO_TF32 Preds = [hasPTX<78>, hasSM<90>]> { @@ -686,8 +686,8 @@ let hasSideEffects = false in { CvtMode:$mode), "cvt${mode:base}${mode:relu}.satfinite." # F6Name # "x4.f32 \t$dst, {{$src1, $src2, $src3, $src4}}, $src5;">; - def CVT_e2m3x4_f32_rs_sf : CVT_TO_FP6X4<"e2m3">; - def CVT_e3m2x4_f32_rs_sf : CVT_TO_FP6X4<"e3m2">; + def CVT_e2m3x4_f32x4_rs_sf : CVT_TO_FP6X4<"e2m3">; + def CVT_e3m2x4_f32x4_rs_sf : CVT_TO_FP6X4<"e3m2">; // FP4 conversions. def CVT_e2m1x2_f32_sf : NVPTXInst<(outs B16:$dst), @@ -706,7 +706,7 @@ let hasSideEffects = false in { "cvt${mode:base}${mode:relu}.f16x2.e2m1x2 \t$dst, %e2m1x2_in; \n\t", "}}"), []>; - def CVT_e2m1x4_f32_rs_sf : + def CVT_e2m1x4_f32x4_rs_sf : NVPTXInst<(outs B16:$dst), (ins B32:$src1, B32:$src2, B32:$src3, B32:$src4, B32:$src5, CvtMode:$mode), diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 16debba5002f8..b1110e94c34e9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1950,43 +1950,58 @@ let Predicates = [hasPTX<86>, hasSM<100>, hasArchAccelFeatures] in { (CVT_bf16x2_ue8m0x2 $a)>; } +def SDT_CVT_F32X4_TO_FP8X4_RS : + SDTypeProfile<1, 6, [SDTCisVec<0>, SDTCisFP<1>, SDTCisFP<2>, SDTCisFP<3>, + SDTCisFP<4>, SDTCisInt<5>, SDTCisInt<6>]>; + +def SDT_CVT_F32X4_TO_FP6X4_RS : + SDTypeProfile<1, 6, [SDTCisVec<0>, SDTCisFP<1>, SDTCisFP<2>, SDTCisFP<3>, + SDTCisFP<4>, SDTCisInt<5>, SDTCisInt<6>]>; + +def SDT_CVT_F32X4_TO_FP4X4_RS : + SDTypeProfile<1, 6, [SDTCisInt<0>, SDTCisFP<1>, SDTCisFP<2>, SDTCisFP<3>, + SDTCisFP<4>, SDTCisInt<5>, SDTCisInt<6>]>; + +class CVT_F32X4_TO_FPX4_RS_SF_NODE : + SDNode<"NVPTXISD::CVT_" # FPName # "X4_F32X4_RS_SF", SDT, []>; + // RS rounding mode conversions let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in { // FP8x4 conversions -def : Pat<(int_nvvm_ff_to_e4m3x4_rs_satfinite - f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), - (CVT_e4m3x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS)>; -def : Pat<(int_nvvm_ff_to_e4m3x4_rs_relu_satfinite - f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), - (CVT_e4m3x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS_RELU)>; -def : Pat<(int_nvvm_ff_to_e5m2x4_rs_satfinite - f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), - (CVT_e5m2x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS)>; -def : Pat<(int_nvvm_ff_to_e5m2x4_rs_relu_satfinite - f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), - (CVT_e5m2x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS_RELU)>; +def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E4M3", SDT_CVT_F32X4_TO_FP8X4_RS> + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)), + (CVT_e4m3x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS)>; +def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E5M2", SDT_CVT_F32X4_TO_FP8X4_RS> + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)), + (CVT_e5m2x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS)>; +def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E4M3", SDT_CVT_F32X4_TO_FP8X4_RS> + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)), + (CVT_e4m3x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>; +def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E5M2", SDT_CVT_F32X4_TO_FP8X4_RS> + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)), + (CVT_e5m2x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>; // FP6x4 conversions -def : Pat<(int_nvvm_ff_to_e2m3x4_rs_satfinite - f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), - (CVT_e2m3x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS)>; -def : Pat<(int_nvvm_ff_to_e2m3x4_rs_relu_satfinite - f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), - (CVT_e2m3x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS_RELU)>; -def : Pat<(int_nvvm_ff_to_e3m2x4_rs_satfinite - f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), - (CVT_e3m2x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS)>; -def : Pat<(int_nvvm_ff_to_e3m2x4_rs_relu_satfinite - f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), - (CVT_e3m2x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS_RELU)>; +def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E2M3", SDT_CVT_F32X4_TO_FP6X4_RS> + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)), + (CVT_e2m3x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS)>; +def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E3M2", SDT_CVT_F32X4_TO_FP6X4_RS> + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)), + (CVT_e3m2x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS)>; +def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E2M3", SDT_CVT_F32X4_TO_FP6X4_RS> + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)), + (CVT_e2m3x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>; +def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E3M2", SDT_CVT_F32X4_TO_FP6X4_RS> + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)), + (CVT_e3m2x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>; // FP4x4 conversions -def : Pat<(int_nvvm_ff_to_e2m1x4_rs_satfinite - f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), - (CVT_e2m1x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS)>; -def : Pat<(int_nvvm_ff_to_e2m1x4_rs_relu_satfinite - f32:$a, f32:$b, f32:$c, f32:$d, i32:$e), - (CVT_e2m1x4_f32_rs_sf $a, $b, $c, $d, $e, CvtRS_RELU)>; +def : Pat<(i16 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E2M1", SDT_CVT_F32X4_TO_FP4X4_RS> + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)), + (CVT_e2m1x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS)>; +def : Pat<(i16 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E2M1", SDT_CVT_F32X4_TO_FP4X4_RS> + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)), + (CVT_e2m1x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>; } // diff --git a/llvm/test/CodeGen/NVPTX/convert-sm103a.ll b/llvm/test/CodeGen/NVPTX/convert-sm103a.ll index 39a9ed18da71c..54b4dd88867ed 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm103a.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm103a.ll @@ -138,190 +138,160 @@ define <2 x bfloat> @cvt_rs_relu_sf_bf16x2_f32(float %f1, float %f2, i32 %rbits) ; F8X4 conversions -define <4 x i8> @cvt_rs_sf_e4m3x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +define <4 x i8> @cvt_rs_sf_e4m3x4_f32(<4 x float> %fvec, i32 %rbits) { ; CHECK-LABEL: cvt_rs_sf_e4m3x4_f32( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_e4m3x4_f32_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_e4m3x4_f32_param_1]; -; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_e4m3x4_f32_param_2]; -; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_sf_e4m3x4_f32_param_3]; -; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e4m3x4_f32_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_sf_e4m3x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e4m3x4_f32_param_1]; ; CHECK-NEXT: cvt.rs.satfinite.e4m3x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; - %val = call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + %val = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> %fvec, i32 %rbits) ret <4 x i8> %val } -define <4 x i8> @cvt_rs_relu_sf_e4m3x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +define <4 x i8> @cvt_rs_relu_sf_e4m3x4_f32(<4 x float> %fvec, i32 %rbits) { ; CHECK-LABEL: cvt_rs_relu_sf_e4m3x4_f32( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_e4m3x4_f32_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_e4m3x4_f32_param_1]; -; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_e4m3x4_f32_param_2]; -; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_relu_sf_e4m3x4_f32_param_3]; -; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e4m3x4_f32_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_relu_sf_e4m3x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e4m3x4_f32_param_1]; ; CHECK-NEXT: cvt.rs.relu.satfinite.e4m3x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; - %val = call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.relu.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + %val = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> %fvec, i32 %rbits) ret <4 x i8> %val } -define <4 x i8> @cvt_rs_sf_e5m2x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +define <4 x i8> @cvt_rs_sf_e5m2x4_f32(<4 x float> %fvec, i32 %rbits) { ; CHECK-LABEL: cvt_rs_sf_e5m2x4_f32( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_e5m2x4_f32_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_e5m2x4_f32_param_1]; -; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_e5m2x4_f32_param_2]; -; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_sf_e5m2x4_f32_param_3]; -; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e5m2x4_f32_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_sf_e5m2x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e5m2x4_f32_param_1]; ; CHECK-NEXT: cvt.rs.satfinite.e5m2x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; - %val = call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + %val = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> %fvec, i32 %rbits) ret <4 x i8> %val } -define <4 x i8> @cvt_rs_relu_sf_e5m2x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +define <4 x i8> @cvt_rs_relu_sf_e5m2x4_f32(<4 x float> %fvec, i32 %rbits) { ; CHECK-LABEL: cvt_rs_relu_sf_e5m2x4_f32( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_e5m2x4_f32_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_e5m2x4_f32_param_1]; -; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_e5m2x4_f32_param_2]; -; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_relu_sf_e5m2x4_f32_param_3]; -; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e5m2x4_f32_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_relu_sf_e5m2x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e5m2x4_f32_param_1]; ; CHECK-NEXT: cvt.rs.relu.satfinite.e5m2x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; - %val = call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.relu.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + %val = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> %fvec, i32 %rbits) ret <4 x i8> %val } ; F6X4 conversions -define <4 x i8> @cvt_rs_sf_e2m3x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +define <4 x i8> @cvt_rs_sf_e2m3x4_f32(<4 x float> %fvec, i32 %rbits) { ; CHECK-LABEL: cvt_rs_sf_e2m3x4_f32( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_e2m3x4_f32_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_e2m3x4_f32_param_1]; -; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_e2m3x4_f32_param_2]; -; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_sf_e2m3x4_f32_param_3]; -; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e2m3x4_f32_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_sf_e2m3x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e2m3x4_f32_param_1]; ; CHECK-NEXT: cvt.rs.satfinite.e2m3x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; - %val = call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + %val = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> %fvec, i32 %rbits) ret <4 x i8> %val } -define <4 x i8> @cvt_rs_relu_sf_e2m3x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +define <4 x i8> @cvt_rs_relu_sf_e2m3x4_f32(<4 x float> %fvec, i32 %rbits) { ; CHECK-LABEL: cvt_rs_relu_sf_e2m3x4_f32( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_e2m3x4_f32_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_e2m3x4_f32_param_1]; -; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_e2m3x4_f32_param_2]; -; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_relu_sf_e2m3x4_f32_param_3]; -; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e2m3x4_f32_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_relu_sf_e2m3x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e2m3x4_f32_param_1]; ; CHECK-NEXT: cvt.rs.relu.satfinite.e2m3x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; - %val = call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.relu.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + %val = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> %fvec, i32 %rbits) ret <4 x i8> %val } -define <4 x i8> @cvt_rs_sf_e3m2x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +define <4 x i8> @cvt_rs_sf_e3m2x4_f32(<4 x float> %fvec, i32 %rbits) { ; CHECK-LABEL: cvt_rs_sf_e3m2x4_f32( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_e3m2x4_f32_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_e3m2x4_f32_param_1]; -; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_e3m2x4_f32_param_2]; -; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_sf_e3m2x4_f32_param_3]; -; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e3m2x4_f32_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_sf_e3m2x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e3m2x4_f32_param_1]; ; CHECK-NEXT: cvt.rs.satfinite.e3m2x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; - %val = call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + %val = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> %fvec, i32 %rbits) ret <4 x i8> %val } -define <4 x i8> @cvt_rs_relu_sf_e3m2x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +define <4 x i8> @cvt_rs_relu_sf_e3m2x4_f32(<4 x float> %fvec, i32 %rbits) { ; CHECK-LABEL: cvt_rs_relu_sf_e3m2x4_f32( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_e3m2x4_f32_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_e3m2x4_f32_param_1]; -; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_e3m2x4_f32_param_2]; -; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_relu_sf_e3m2x4_f32_param_3]; -; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e3m2x4_f32_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_relu_sf_e3m2x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e3m2x4_f32_param_1]; ; CHECK-NEXT: cvt.rs.relu.satfinite.e3m2x4.f32 %r6, {%r1, %r2, %r3, %r4}, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; - %val = call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.relu.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + %val = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> %fvec, i32 %rbits) ret <4 x i8> %val } ; F4X4 conversions -define i16 @cvt_rs_sf_e2m1x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +define i16 @cvt_rs_sf_e2m1x4_f32(<4 x float> %fvec, i32 %rbits) { ; CHECK-LABEL: cvt_rs_sf_e2m1x4_f32( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_sf_e2m1x4_f32_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_sf_e2m1x4_f32_param_1]; -; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_sf_e2m1x4_f32_param_2]; -; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_sf_e2m1x4_f32_param_3]; -; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e2m1x4_f32_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_sf_e2m1x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_sf_e2m1x4_f32_param_1]; ; CHECK-NEXT: cvt.rs.satfinite.e2m1x4.f32 %rs1, {%r1, %r2, %r3, %r4}, %r5; ; CHECK-NEXT: cvt.u32.u16 %r6, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; - %val = call i16 @llvm.nvvm.ff.to.e2m1x4.rs.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + %val = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> %fvec, i32 %rbits) ret i16 %val } -define i16 @cvt_rs_relu_sf_e2m1x4_f32(float %f1, float %f2, float %f3, float %f4, i32 %rbits) { +define i16 @cvt_rs_relu_sf_e2m1x4_f32(<4 x float> %fvec, i32 %rbits) { ; CHECK-LABEL: cvt_rs_relu_sf_e2m1x4_f32( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [cvt_rs_relu_sf_e2m1x4_f32_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [cvt_rs_relu_sf_e2m1x4_f32_param_1]; -; CHECK-NEXT: ld.param.b32 %r3, [cvt_rs_relu_sf_e2m1x4_f32_param_2]; -; CHECK-NEXT: ld.param.b32 %r4, [cvt_rs_relu_sf_e2m1x4_f32_param_3]; -; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e2m1x4_f32_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [cvt_rs_relu_sf_e2m1x4_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r5, [cvt_rs_relu_sf_e2m1x4_f32_param_1]; ; CHECK-NEXT: cvt.rs.relu.satfinite.e2m1x4.f32 %rs1, {%r1, %r2, %r3, %r4}, %r5; ; CHECK-NEXT: cvt.u32.u16 %r6, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; - %val = call i16 @llvm.nvvm.ff.to.e2m1x4.rs.relu.satfinite(float %f1, float %f2, float %f3, float %f4, i32 %rbits) + %val = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> %fvec, i32 %rbits) ret i16 %val } From 52c12523ab1c70ab37c20c2fe9a8ba37e9a9a3d6 Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi Date: Wed, 24 Sep 2025 16:13:35 +0530 Subject: [PATCH 3/8] add comment --- llvm/include/llvm/IR/IntrinsicsNVVM.td | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index d78ef6e7d8b78..23d878f726c5e 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1493,7 +1493,8 @@ let TargetPrefix = "nvvm" in { } } - // RS rounding mode conversions for f16x2, bf16x2 types + // RS rounding mode (Stochastic Rounding) conversions for f16x2, bf16x2 types + // The last i32 operand provides the random bits for the conversion foreach relu = ["", "_relu"] in { foreach satfinite = ["", "_satfinite"] in { def int_nvvm_ff2f16x2_rs # relu # satfinite : NVVMBuiltin, @@ -1527,7 +1528,8 @@ let TargetPrefix = "nvvm" in { } } - // RS rounding mode conversions for f8x4 types + // RS rounding mode (Stochastic Rounding) conversions for f8x4 types + // The last i32 operand provides the random bits for the conversion foreach type = ["e4m3x4", "e5m2x4"] in { foreach relu = ["", "_relu"] in { def int_nvvm_f32x4_to_ # type # _rs # relu # _satfinite : NVVMBuiltin, @@ -1544,7 +1546,8 @@ let TargetPrefix = "nvvm" in { PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; } - // RS rounding mode conversions for f4x4 type + // RS rounding mode (Stochastic Rounding) conversions for f4x4 type + // The last i32 operand provides the random bits for the conversion foreach relu = ["", "_relu"] in { def int_nvvm_f32x4_to_e2m1x4_rs # relu # _satfinite : NVVMBuiltin, PureIntrinsic<[llvm_i16_ty], [llvm_v4f32_ty, llvm_i32_ty]>; @@ -1561,7 +1564,8 @@ let TargetPrefix = "nvvm" in { } } - // RS rounding mode conversions for f6x4 types + // RS rounding mode (Stochastic Rounding) conversions for f6x4 types + // The last i32 operand provides the random bits for the conversion foreach type = ["e2m3x4", "e3m2x4"] in { foreach relu = ["", "_relu"] in { def int_nvvm_f32x4_to_ # type # _rs # relu # _satfinite : NVVMBuiltin, From 5bd91954a2c47150d7a539231b203c2fe6962226 Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi Date: Thu, 25 Sep 2025 14:05:32 +0530 Subject: [PATCH 4/8] address comments --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 40 +++++--------- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 60 +++++++++------------ 2 files changed, 40 insertions(+), 60 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 07798f6fed6a3..8f3be88afd544 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2845,19 +2845,6 @@ static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, {TryCancelResponse0, TryCancelResponse1}); } -bool isCvtRSReluIntrinsic(Intrinsic::ID ID) { - switch (ID) { - case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite: - case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite: - case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite: - case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite: - case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite: - return true; - default: - return false; - } -} - static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); SDLoc DL(N); @@ -2867,34 +2854,35 @@ static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG) { unsigned IntrinsicID = N->getConstantOperandVal(0); uint32_t CvtModeFlag = NVPTX::PTXCvtMode::CvtMode::RS; - if (isCvtRSReluIntrinsic(IntrinsicID)) - CvtModeFlag |= NVPTX::PTXCvtMode::CvtMode::RELU_FLAG; - - SDValue Float1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec, - DAG.getIntPtrConstant(0, DL)); - SDValue Float2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec, - DAG.getIntPtrConstant(1, DL)); - SDValue Float3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec, - DAG.getIntPtrConstant(2, DL)); - SDValue Float4 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec, - DAG.getIntPtrConstant(3, DL)); + + // Extract the 4 float elements from the vector + SmallVector Ops; + for (unsigned i = 0; i < 4; ++i) { + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec, + DAG.getIntPtrConstant(i, DL))); + } auto OpSignature = [&]() -> std::pair { switch (IntrinsicID) { case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite: + CvtModeFlag |= NVPTX::PTXCvtMode::CvtMode::RELU_FLAG; case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite: return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8}; case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite: + CvtModeFlag |= NVPTX::PTXCvtMode::CvtMode::RELU_FLAG; case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite: return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8}; case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite: + CvtModeFlag |= NVPTX::PTXCvtMode::CvtMode::RELU_FLAG; case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite: return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8}; case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite: + CvtModeFlag |= NVPTX::PTXCvtMode::CvtMode::RELU_FLAG; case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite: return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8}; case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite: + CvtModeFlag |= NVPTX::PTXCvtMode::CvtMode::RELU_FLAG; case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite: return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16}; default: @@ -2902,8 +2890,8 @@ static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG) { } }(); - SDValue Ops[] = {Float1, Float2, Float3, - Float4, RBits, DAG.getConstant(CvtModeFlag, DL, MVT::i32)}; + Ops.push_back(RBits); + Ops.push_back(DAG.getConstant(CvtModeFlag, DL, MVT::i32)); return DAG.getNode(OpSignature.first, DL, OpSignature.second, Ops); } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index b1110e94c34e9..2c525461836d8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1950,57 +1950,49 @@ let Predicates = [hasPTX<86>, hasSM<100>, hasArchAccelFeatures] in { (CVT_bf16x2_ue8m0x2 $a)>; } -def SDT_CVT_F32X4_TO_FP8X4_RS : +def SDT_CVT_F32X4_TO_FPX4_RS_VEC : SDTypeProfile<1, 6, [SDTCisVec<0>, SDTCisFP<1>, SDTCisFP<2>, SDTCisFP<3>, SDTCisFP<4>, SDTCisInt<5>, SDTCisInt<6>]>; -def SDT_CVT_F32X4_TO_FP6X4_RS : - SDTypeProfile<1, 6, [SDTCisVec<0>, SDTCisFP<1>, SDTCisFP<2>, SDTCisFP<3>, - SDTCisFP<4>, SDTCisInt<5>, SDTCisInt<6>]>; - -def SDT_CVT_F32X4_TO_FP4X4_RS : +def SDT_CVT_F32X4_TO_FPX4_RS_INT : SDTypeProfile<1, 6, [SDTCisInt<0>, SDTCisFP<1>, SDTCisFP<2>, SDTCisFP<3>, SDTCisFP<4>, SDTCisInt<5>, SDTCisInt<6>]>; class CVT_F32X4_TO_FPX4_RS_SF_NODE : SDNode<"NVPTXISD::CVT_" # FPName # "X4_F32X4_RS_SF", SDT, []>; + +multiclass CVT_F32X4_TO_FPX4_RS_SF_VEC { + def : Pat<(RetTy (CVT_F32X4_TO_FPX4_RS_SF_NODE + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)), + (!cast(CVT_ # FPName # "x4_f32x4_rs_sf") + $f1, $f2, $f3, $f4, $rbits, CvtRS)>; + + def : Pat<(RetTy (CVT_F32X4_TO_FPX4_RS_SF_NODE + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)), + (!cast(CVT_ # FPName # "x4_f32x4_rs_sf") + $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>; +} // RS rounding mode conversions let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in { // FP8x4 conversions -def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E4M3", SDT_CVT_F32X4_TO_FP8X4_RS> - f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)), - (CVT_e4m3x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS)>; -def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E5M2", SDT_CVT_F32X4_TO_FP8X4_RS> - f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)), - (CVT_e5m2x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS)>; -def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E4M3", SDT_CVT_F32X4_TO_FP8X4_RS> - f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)), - (CVT_e4m3x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>; -def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E5M2", SDT_CVT_F32X4_TO_FP8X4_RS> - f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)), - (CVT_e5m2x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>; +def : CVT_F32X4_TO_FPX4_RS_SF_VEC<"E4M3", v4i8>; +def : CVT_F32X4_TO_FPX4_RS_SF_VEC<"E5M2", v4i8>; // FP6x4 conversions -def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E2M3", SDT_CVT_F32X4_TO_FP6X4_RS> - f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)), - (CVT_e2m3x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS)>; -def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E3M2", SDT_CVT_F32X4_TO_FP6X4_RS> - f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)), - (CVT_e3m2x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS)>; -def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E2M3", SDT_CVT_F32X4_TO_FP6X4_RS> - f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)), - (CVT_e2m3x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>; -def : Pat<(v4i8 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E3M2", SDT_CVT_F32X4_TO_FP6X4_RS> - f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)), - (CVT_e3m2x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>; +def : CVT_F32X4_TO_FPX4_RS_SF_VEC<"E2M3", v4i8>; +def : CVT_F32X4_TO_FPX4_RS_SF_VEC<"E3M2", v4i8>; // FP4x4 conversions -def : Pat<(i16 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E2M1", SDT_CVT_F32X4_TO_FP4X4_RS> - f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)), +def : Pat<(i16 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E2M1", + SDT_CVT_F32X4_TO_FPX4_RS_INT> + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)), (CVT_e2m1x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS)>; -def : Pat<(i16 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E2M1", SDT_CVT_F32X4_TO_FP4X4_RS> - f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)), +def : Pat<(i16 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E2M1", + SDT_CVT_F32X4_TO_FPX4_RS_INT> + f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)), (CVT_e2m1x4_f32x4_rs_sf $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>; } From d1f60de23c5b51f389f4a833f90b49005cd176d5 Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi Date: Thu, 25 Sep 2025 14:25:52 +0530 Subject: [PATCH 5/8] fix errors --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 2c525461836d8..28d4bb917ff69 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1965,25 +1965,25 @@ multiclass CVT_F32X4_TO_FPX4_RS_SF_VEC { def : Pat<(RetTy (CVT_F32X4_TO_FPX4_RS_SF_NODE f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS)), - (!cast(CVT_ # FPName # "x4_f32x4_rs_sf") + (!cast("CVT_" # FPName # "x4_f32x4_rs_sf") $f1, $f2, $f3, $f4, $rbits, CvtRS)>; def : Pat<(RetTy (CVT_F32X4_TO_FPX4_RS_SF_NODE f32:$f1, f32:$f2, f32:$f3, f32:$f4, i32:$rbits, CvtRS_RELU)), - (!cast(CVT_ # FPName # "x4_f32x4_rs_sf") + (!cast("CVT_" # FPName # "x4_f32x4_rs_sf") $f1, $f2, $f3, $f4, $rbits, CvtRS_RELU)>; } // RS rounding mode conversions let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in { // FP8x4 conversions -def : CVT_F32X4_TO_FPX4_RS_SF_VEC<"E4M3", v4i8>; -def : CVT_F32X4_TO_FPX4_RS_SF_VEC<"E5M2", v4i8>; +defm : CVT_F32X4_TO_FPX4_RS_SF_VEC<"e4m3", v4i8>; +defm : CVT_F32X4_TO_FPX4_RS_SF_VEC<"e5m2", v4i8>; // FP6x4 conversions -def : CVT_F32X4_TO_FPX4_RS_SF_VEC<"E2M3", v4i8>; -def : CVT_F32X4_TO_FPX4_RS_SF_VEC<"E3M2", v4i8>; +defm : CVT_F32X4_TO_FPX4_RS_SF_VEC<"e2m3", v4i8>; +defm : CVT_F32X4_TO_FPX4_RS_SF_VEC<"e3m2", v4i8>; // FP4x4 conversions def : Pat<(i16 (CVT_F32X4_TO_FPX4_RS_SF_NODE<"E2M1", From b8e5da64081022dc9b09841b4a1e86809bdbf80f Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi Date: Thu, 25 Sep 2025 17:55:05 +0530 Subject: [PATCH 6/8] fix fallthrough warning --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 35 ++++++++++++--------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 8f3be88afd544..fbde2c14a4e02 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2853,8 +2853,6 @@ static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG) { unsigned IntrinsicID = N->getConstantOperandVal(0); - uint32_t CvtModeFlag = NVPTX::PTXCvtMode::CvtMode::RS; - // Extract the 4 float elements from the vector SmallVector Ops; for (unsigned i = 0; i < 4; ++i) { @@ -2862,29 +2860,36 @@ static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG) { DAG.getIntPtrConstant(i, DL))); } - auto OpSignature = - [&]() -> std::pair { + using NVPTX::PTXCvtMode::CvtMode; + + auto [OpCode, RetTy, CvtModeFlag] = + [&]() -> std::tuple { switch (IntrinsicID) { case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite: - CvtModeFlag |= NVPTX::PTXCvtMode::CvtMode::RELU_FLAG; + return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8, + CvtMode::RS | CvtMode::RELU_FLAG}; case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite: - return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8}; + return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS}; case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite: - CvtModeFlag |= NVPTX::PTXCvtMode::CvtMode::RELU_FLAG; + return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8, + CvtMode::RS | CvtMode::RELU_FLAG}; case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite: - return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8}; + return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS}; case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite: - CvtModeFlag |= NVPTX::PTXCvtMode::CvtMode::RELU_FLAG; + return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8, + CvtMode::RS | CvtMode::RELU_FLAG}; case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite: - return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8}; + return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS}; case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite: - CvtModeFlag |= NVPTX::PTXCvtMode::CvtMode::RELU_FLAG; + return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8, + CvtMode::RS | CvtMode::RELU_FLAG}; case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite: - return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8}; + return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS}; case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite: - CvtModeFlag |= NVPTX::PTXCvtMode::CvtMode::RELU_FLAG; + return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16, + CvtMode::RS | CvtMode::RELU_FLAG}; case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite: - return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16}; + return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16, CvtMode::RS}; default: llvm_unreachable("unsupported/unhandled intrinsic"); } @@ -2893,7 +2898,7 @@ static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG) { Ops.push_back(RBits); Ops.push_back(DAG.getConstant(CvtModeFlag, DL, MVT::i32)); - return DAG.getNode(OpSignature.first, DL, OpSignature.second, Ops); + return DAG.getNode(OpCode, DL, RetTy, Ops); } static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) { From 6f3cc7bf6f13ece40b119563cf73ad5a1c81c96a Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi Date: Fri, 26 Sep 2025 14:53:40 +0530 Subject: [PATCH 7/8] address comments --- clang/include/clang/Basic/BuiltinsNVPTX.td | 72 ++++++-- clang/test/CodeGen/builtins-nvptx.c | 180 ++++++++++++-------- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3 +- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 36 ++-- 4 files changed, 185 insertions(+), 106 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td index 819262d87a917..d923d2a90e908 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.td +++ b/clang/include/clang/Basic/BuiltinsNVPTX.td @@ -579,19 +579,35 @@ def __nvvm_ff2bf16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float) def __nvvm_ff2bf16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>; def __nvvm_ff2bf16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>; def __nvvm_ff2bf16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>; -def __nvvm_ff2bf16x2_rs : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_ff2bf16x2_rs_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_ff2bf16x2_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_ff2bf16x2_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2bf16x2_rs : + NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2bf16x2_rs_relu : + NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2bf16x2_rs_satfinite : + NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2bf16x2_rs_relu_satfinite : + NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; def __nvvm_ff2f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; def __nvvm_ff2f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; def __nvvm_ff2f16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; def __nvvm_ff2f16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; -def __nvvm_ff2f16x2_rs : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_ff2f16x2_rs_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_ff2f16x2_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_ff2f16x2_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2f16x2_rs : + NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2f16x2_rs_relu : + NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2f16x2_rs_satfinite : + NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_ff2f16x2_rs_relu_satfinite : + NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; def __nvvm_f2bf16_rn : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2bf16_rn_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; @@ -624,10 +640,18 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>; def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>; -def __nvvm_f32x4_to_e4m3x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_f32x4_to_e4m3x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_f32x4_to_e5m2x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_f32x4_to_e5m2x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e4m3x4_rs_satfinite : + NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e4m3x4_rs_relu_satfinite : + NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e5m2x4_rs_satfinite : + NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e5m2x4_rs_relu_satfinite : + NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; def __nvvm_ff_to_e2m3x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_ff_to_e2m3x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; @@ -639,10 +663,18 @@ def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; -def __nvvm_f32x4_to_e2m3x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_f32x4_to_e2m3x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_f32x4_to_e3m2x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_f32x4_to_e3m2x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e2m3x4_rs_satfinite : + NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e2m3x4_rs_relu_satfinite : + NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e3m2x4_rs_satfinite : + NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e3m2x4_rs_relu_satfinite : + NVPTXBuiltinSMAndPTX<"_Vector<4, char>(_Vector<4, float>, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; def __nvvm_ff_to_e2m1x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_ff_to_e2m1x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; @@ -650,8 +682,12 @@ def __nvvm_ff_to_e2m1x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, f def __nvvm_e2m1x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_e2m1x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; -def __nvvm_f32x4_to_e2m1x4_rs_satfinite : NVPTXBuiltinSMAndPTX<"short(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; -def __nvvm_f32x4_to_e2m1x4_rs_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(_Vector<4, float>, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e2m1x4_rs_satfinite : + NVPTXBuiltinSMAndPTX<"short(_Vector<4, float>, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; +def __nvvm_f32x4_to_e2m1x4_rs_relu_satfinite : + NVPTXBuiltinSMAndPTX<"short(_Vector<4, float>, uint32_t)", + SM<"100a", [SM_103a]>, PTX87>; def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 0cf116ea5c5b4..4f14e9dcc8a72 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -1212,77 +1212,117 @@ __device__ void nvvm_cvt_sm100a_sm101a_sm120a() { __device__ void nvvm_cvt_sm100a_sm103a() { #if (PTX >= 87) && (__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM103_ALL) -// CHECK_PTX87_SM100a: call <2 x half> @llvm.nvvm.ff2f16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <2 x half> @llvm.nvvm.ff2f16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff2f16x2_rs(1.0f, 1.0f, 0); + typedef __fp16 f16x2 __attribute__((ext_vector_type(2))); + typedef __bf16 bf16x2 __attribute__((ext_vector_type(2))); + typedef char uint8x4 __attribute__((ext_vector_type(4))); + +// CHECK_PTX87_SM100a: %[[R1:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM100a: store <2 x half> %[[R1]], ptr %r1 +// CHECK_PTX87_SM103a: %[[R1:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: store <2 x half> %[[R1]], ptr %r1 + f16x2 r1 = __nvvm_ff2f16x2_rs(1.0f, 1.0f, 0); -// CHECK_PTX87_SM100a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff2f16x2_rs_relu(1.0f, 1.0f, 0); - -// CHECK_PTX87_SM100a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff2f16x2_rs_satfinite(1.0f, 1.0f, 0); - -// CHECK_PTX87_SM100a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff2f16x2_rs_relu_satfinite(1.0f, 1.0f, 0); - -// CHECK_PTX87_SM100a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff2bf16x2_rs(1.0f, 1.0f, 0); - -// CHECK_PTX87_SM100a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff2bf16x2_rs_relu(1.0f, 1.0f, 0); - -// CHECK_PTX87_SM100a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff2bf16x2_rs_satfinite(1.0f, 1.0f, 0); - -// CHECK_PTX87_SM100a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) - __nvvm_ff2bf16x2_rs_relu_satfinite(1.0f, 1.0f, 0); - -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) - __nvvm_f32x4_to_e4m3x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); - -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) - __nvvm_f32x4_to_e4m3x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); - -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) - __nvvm_f32x4_to_e5m2x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); - -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) - __nvvm_f32x4_to_e5m2x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); - -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) - __nvvm_f32x4_to_e2m3x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); - -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) - __nvvm_f32x4_to_e2m3x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); - -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) - __nvvm_f32x4_to_e3m2x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); - -// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) -// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) - __nvvm_f32x4_to_e3m2x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); - -// CHECK_PTX87_SM100a: call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) -// CHECK_PTX87_SM103a: call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) - __nvvm_f32x4_to_e2m1x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); - -// CHECK_PTX87_SM100a: call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) -// CHECK_PTX87_SM103a: call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) - __nvvm_f32x4_to_e2m1x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); +// CHECK_PTX87_SM100a: %[[R2:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM100a: store <2 x half> %[[R2]], ptr %r2 +// CHECK_PTX87_SM103a: %[[R2:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: store <2 x half> %[[R2]], ptr %r2 + f16x2 r2 = __nvvm_ff2f16x2_rs_relu(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: %[[R3:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM100a: store <2 x half> %[[R3]], ptr %r3 +// CHECK_PTX87_SM103a: %[[R3:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: store <2 x half> %[[R3]], ptr %r3 + f16x2 r3 = __nvvm_ff2f16x2_rs_satfinite(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: %[[R4:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM100a: store <2 x half> %[[R4]], ptr %r4 +// CHECK_PTX87_SM103a: %[[R4:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: store <2 x half> %[[R4]], ptr + f16x2 r4 = __nvvm_ff2f16x2_rs_relu_satfinite(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: %[[R5:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM100a: store <2 x bfloat> %[[R5]], ptr %r5 +// CHECK_PTX87_SM103a: %[[R5:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: store <2 x bfloat> %[[R5]], ptr %r5 + bf16x2 r5 = __nvvm_ff2bf16x2_rs(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: %[[R6:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM100a: store <2 x bfloat> %[[R6]], ptr %r6 +// CHECK_PTX87_SM103a: %[[R6:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: store <2 x bfloat> %[[R6]], ptr %r6 + bf16x2 r6 = __nvvm_ff2bf16x2_rs_relu(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: %[[R7:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM100a: store <2 x bfloat> %[[R7]], ptr %r7 +// CHECK_PTX87_SM103a: %[[R7:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: store <2 x bfloat> %[[R7]], ptr %r7 + bf16x2 r7 = __nvvm_ff2bf16x2_rs_satfinite(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: %[[R8:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM100a: store <2 x bfloat> %[[R8]], ptr %r8 +// CHECK_PTX87_SM103a: %[[R8:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) +// CHECK_PTX87_SM103a: store <2 x bfloat> %[[R8]], ptr %r8 + bf16x2 r8 = __nvvm_ff2bf16x2_rs_relu_satfinite(1.0f, 1.0f, 0); + +// CHECK_PTX87_SM100a: %[[R9:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM100a: store <4 x i8> %[[R9]], ptr %r9 +// CHECK_PTX87_SM103a: %[[R9:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: store <4 x i8> %[[R9]], ptr %r9 + uint8x4 r9 = __nvvm_f32x4_to_e4m3x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); + +// CHECK_PTX87_SM100a: %[[R10:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM100a: store <4 x i8> %[[R10]], ptr %r10 +// CHECK_PTX87_SM103a: %[[R10:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: store <4 x i8> %[[R10]], ptr %r10 + uint8x4 r10 = __nvvm_f32x4_to_e4m3x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); + +// CHECK_PTX87_SM100a: %[[R11:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM100a: store <4 x i8> %[[R11]], ptr %r11 +// CHECK_PTX87_SM103a: %[[R11:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: store <4 x i8> %[[R11]], ptr %r11 + uint8x4 r11 = __nvvm_f32x4_to_e5m2x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); + +// CHECK_PTX87_SM100a: %[[R12:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM100a: store <4 x i8> %[[R12]], ptr %r12 +// CHECK_PTX87_SM103a: %[[R12:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: store <4 x i8> %[[R12]], ptr %r12 + uint8x4 r12 = __nvvm_f32x4_to_e5m2x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); + +// CHECK_PTX87_SM100a: %[[R13:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM100a: store <4 x i8> %[[R13]], ptr %r13 +// CHECK_PTX87_SM103a: %[[R13:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: store <4 x i8> %[[R13]], ptr %r13 + uint8x4 r13 = __nvvm_f32x4_to_e2m3x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); + +// CHECK_PTX87_SM100a: %[[R14:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM100a: store <4 x i8> %[[R14]], ptr %r14 +// CHECK_PTX87_SM103a: %[[R14:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: store <4 x i8> %[[R14]], ptr %r14 + uint8x4 r14 = __nvvm_f32x4_to_e2m3x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); + +// CHECK_PTX87_SM100a: %[[R15:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM100a: store <4 x i8> %[[R15]], ptr %r15 +// CHECK_PTX87_SM103a: %[[R15:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: store <4 x i8> %[[R15]], ptr %r15 + uint8x4 r15 = __nvvm_f32x4_to_e3m2x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); + +// CHECK_PTX87_SM100a: %[[R16:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM100a: store <4 x i8> %[[R16]], ptr %r16 +// CHECK_PTX87_SM103a: %[[R16:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: store <4 x i8> %[[R16]], ptr %r16 + uint8x4 r16 = __nvvm_f32x4_to_e3m2x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); + +// CHECK_PTX87_SM100a: %[[R17:.*]] = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM100a: store i16 %[[R17]], ptr %r17 +// CHECK_PTX87_SM103a: %[[R17:.*]] = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: store i16 %[[R17]], ptr %r17 + short r17 = __nvvm_f32x4_to_e2m1x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); + +// CHECK_PTX87_SM100a: %[[R18:.*]] = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM100a: store i16 %[[R18]], ptr %r18 +// CHECK_PTX87_SM103a: %[[R18:.*]] = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) +// CHECK_PTX87_SM103a: store i16 %[[R18]], ptr %r18 + short r18 = __nvvm_f32x4_to_e2m1x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); #endif } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index fbde2c14a4e02..48b2207aee19e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2855,10 +2855,9 @@ static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG) { // Extract the 4 float elements from the vector SmallVector Ops; - for (unsigned i = 0; i < 4; ++i) { + for (unsigned i = 0; i < 4; ++i) Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec, DAG.getIntPtrConstant(i, DL))); - } using NVPTX::PTXCvtMode::CvtMode; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index e45c6f592eeac..7fe550378d269 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -34,7 +34,7 @@ def CvtRN : PatLeaf<(i32 0x5)>; def CvtRZ : PatLeaf<(i32 0x6)>; def CvtRM : PatLeaf<(i32 0x7)>; def CvtRP : PatLeaf<(i32 0x8)>; -def CvtRNA : PatLeaf<(i32 0x9)>; +def CvtRNA : PatLeaf<(i32 0x9)>; def CvtRS : PatLeaf<(i32 0xA)>; def CvtNONE_FTZ : PatLeaf<(i32 0x10)>; @@ -51,9 +51,9 @@ def CvtSAT : PatLeaf<(i32 0x20)>; def CvtSAT_FTZ : PatLeaf<(i32 0x30)>; def CvtNONE_RELU : PatLeaf<(i32 0x40)>; -def CvtRN_RELU : PatLeaf<(i32 0x45)>; -def CvtRZ_RELU : PatLeaf<(i32 0x46)>; -def CvtRS_RELU : PatLeaf<(i32 0x4A)>; +def CvtRN_RELU : PatLeaf<(i32 0x45)>; +def CvtRZ_RELU : PatLeaf<(i32 0x46)>; +def CvtRS_RELU : PatLeaf<(i32 0x4A)>; def CvtMode : Operand { let PrintMethod = "printCvtMode"; @@ -135,7 +135,9 @@ def hasSM101a : Predicate<"Subtarget->getSmVersion() == 101 && Subtarget->hasArc def hasSM120a : Predicate<"Subtarget->getSmVersion() == 120 && Subtarget->hasArchAccelFeatures()">; def hasSM100aOrSM103a : - Predicate<"(Subtarget->getSmVersion() == 100 || Subtarget->getSmVersion() == 103) && Subtarget->hasArchAccelFeatures()">; + Predicate<"(Subtarget->getSmVersion() == 100 || " # + "Subtarget->getSmVersion() == 103) " # + "&& Subtarget->hasArchAccelFeatures()">; // non-sync shfl instructions are not available on sm_70+ in PTX6.4+ def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" @@ -601,12 +603,14 @@ let hasSideEffects = false in { multiclass CVT_FROM_FLOAT_V2_RS { def _f32_rs : BasicFlagsNVPTXInst<(outs RC:$dst), - (ins B32:$src1, B32:$src2, B32:$src3), (ins CvtMode:$mode), + (ins B32:$src1, B32:$src2, B32:$src3), + (ins CvtMode:$mode), "cvt${mode:base}${mode:relu}." # FromName # ".f32">; def _f32_rs_sf : BasicFlagsNVPTXInst<(outs RC:$dst), - (ins B32:$src1, B32:$src2, B32:$src3), (ins CvtMode:$mode), + (ins B32:$src1, B32:$src2, B32:$src3), + (ins CvtMode:$mode), "cvt${mode:base}${mode:relu}.satfinite." # FromName # ".f32">; } @@ -641,9 +645,9 @@ let hasSideEffects = false in { class CVT_TO_FP8X4 : NVPTXInst<(outs B32:$dst), - (ins B32:$src1, B32:$src2, B32:$src3, B32:$src4, B32:$src5, - CvtMode:$mode), - "cvt${mode:base}${mode:relu}.satfinite." # F8Name # "x4.f32 \t$dst, {{$src1, $src2, $src3, $src4}}, $src5;">; + (ins B32:$src1, B32:$src2, B32:$src3, B32:$src4, B32:$src5, CvtMode:$mode), + "cvt${mode:base}${mode:relu}.satfinite." # F8Name # + "x4.f32 \t$dst, {{$src1, $src2, $src3, $src4}}, $src5;">; def CVT_e4m3x4_f32x4_rs_sf : CVT_TO_FP8X4<"e4m3">; def CVT_e5m2x4_f32x4_rs_sf : CVT_TO_FP8X4<"e5m2">; @@ -682,9 +686,9 @@ let hasSideEffects = false in { class CVT_TO_FP6X4 : NVPTXInst<(outs B32:$dst), - (ins B32:$src1, B32:$src2, B32:$src3, B32:$src4, B32:$src5, - CvtMode:$mode), - "cvt${mode:base}${mode:relu}.satfinite." # F6Name # "x4.f32 \t$dst, {{$src1, $src2, $src3, $src4}}, $src5;">; + (ins B32:$src1, B32:$src2, B32:$src3, B32:$src4, B32:$src5, CvtMode:$mode), + "cvt${mode:base}${mode:relu}.satfinite." # F6Name # + "x4.f32 \t$dst, {{$src1, $src2, $src3, $src4}}, $src5;">; def CVT_e2m3x4_f32x4_rs_sf : CVT_TO_FP6X4<"e2m3">; def CVT_e3m2x4_f32x4_rs_sf : CVT_TO_FP6X4<"e3m2">; @@ -708,9 +712,9 @@ let hasSideEffects = false in { def CVT_e2m1x4_f32x4_rs_sf : NVPTXInst<(outs B16:$dst), - (ins B32:$src1, B32:$src2, B32:$src3, B32:$src4, B32:$src5, - CvtMode:$mode), - "cvt${mode:base}${mode:relu}.satfinite.e2m1x4.f32 \t$dst, {{$src1, $src2, $src3, $src4}}, $src5;">; + (ins B32:$src1, B32:$src2, B32:$src3, B32:$src4, B32:$src5, CvtMode:$mode), + "cvt${mode:base}${mode:relu}.satfinite.e2m1x4.f32 \t" # + "$dst, {{$src1, $src2, $src3, $src4}}, $src5;">; // UE8M0x2 conversions. class CVT_f32_to_ue8m0x2 : From 4abab6e882ae16ea239cd7d8dba83d71ccba2a37 Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi Date: Wed, 1 Oct 2025 06:30:18 +0000 Subject: [PATCH 8/8] minor test fix --- clang/test/CodeGen/builtins-nvptx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 4f14e9dcc8a72..e3be262622844 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -1237,7 +1237,7 @@ __device__ void nvvm_cvt_sm100a_sm103a() { // CHECK_PTX87_SM100a: %[[R4:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) // CHECK_PTX87_SM100a: store <2 x half> %[[R4]], ptr %r4 // CHECK_PTX87_SM103a: %[[R4:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) -// CHECK_PTX87_SM103a: store <2 x half> %[[R4]], ptr +// CHECK_PTX87_SM103a: store <2 x half> %[[R4]], ptr %r4 f16x2 r4 = __nvvm_ff2f16x2_rs_relu_satfinite(1.0f, 1.0f, 0); // CHECK_PTX87_SM100a: %[[R5:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0)