diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td index 61e48b31c244b..bdbdfa2cea6c6 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.td +++ b/clang/include/clang/Basic/BuiltinsNVPTX.td @@ -580,6 +580,15 @@ def __nvvm_f2bf16_rz : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>; +def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_89, PTX81>; +def __nvvm_f2tf32_rn : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; +def __nvvm_f2tf32_rn_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; +def __nvvm_f2tf32_rn_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>; +def __nvvm_f2tf32_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>; +def __nvvm_f2tf32_rz : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; +def __nvvm_f2tf32_rz_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; +def __nvvm_f2tf32_rz_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>; +def __nvvm_f2tf32_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>; def __nvvm_ff_to_e4m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", SM_89, PTX81>; def __nvvm_ff_to_e4m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", SM_89, PTX81>; @@ -596,6 +605,28 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>; def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>; +def __nvvm_ff_to_e2m3x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_e2m3x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_e3m2x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_e3m2x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + +def __nvvm_e2m3x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + +def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + +def __nvvm_bf16x2_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_bf16x2_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_bf16x2_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_bf16x2_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + +def __nvvm_ue8m0x2_to_bf16x2 : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + // FNS let Attributes = [NoThrow] in { def __nvvm_fns : NVPTXBuiltinPTX<"unsigned int(unsigned int, unsigned int, int)", PTX60>; diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index e2c159aac903f..7404ce01c535c 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -25,14 +25,29 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s -// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \ +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 -DPTX=81\ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_90 -target-feature +ptx78 -DPTX=78 \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX78_SM90 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100 -target-feature +ptx86 -DPTX=86 \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx86 -DPTX=86 \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100a %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_101a -target-feature +ptx86 -DPTX=86 \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM101a %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \ +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s // ### The last run to check with the highest SM and PTX version available // ### to make sure target builtins are still accepted. -// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \ +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -995,7 +1010,7 @@ __device__ void nvvm_cvt_sm80() { // CHECK-LABEL: nvvm_cvt_sm89 __device__ void nvvm_cvt_sm89() { -#if __CUDA_ARCH__ >= 890 +#if (PTX >= 81) && (__CUDA_ARCH__ >= 890) // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.000000e+00, float 1.000000e+00) __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f); // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00) @@ -1022,6 +1037,133 @@ __device__ void nvvm_cvt_sm89() { __nvvm_e5m2x2_to_f16x2_rn(0x4c4c); // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532) __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c); + + // CHECK_PTX81_SM89: call i32 @llvm.nvvm.f2tf32.rna.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rna_satfinite(1.0f); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_cvt_sm90 +__device__ void nvvm_cvt_sm90() { +#if (PTX >= 78) && (__CUDA_ARCH__ >= 900) + // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn(float 1.000000e+00) + __nvvm_f2tf32_rn(1.0f); + // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn.relu(float 1.000000e+00) + __nvvm_f2tf32_rn_relu(1.0f); + // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz(float 1.000000e+00) + __nvvm_f2tf32_rz(1.0f); + // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz.relu(float 1.000000e+00) + __nvvm_f2tf32_rz_relu(1.0f); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_cvt_sm100 +__device__ void nvvm_cvt_sm100() { +#if (PTX >= 86) && (__CUDA_ARCH__ >= 1000) + // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rn_satfinite(1.0f); + // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rn_relu_satfinite(1.0f); + // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rz_satfinite(1.0f); + // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rz_relu_satfinite(1.0f); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_cvt_sm100a_sm101a_sm120a +__device__ void nvvm_cvt_sm100a_sm101a_sm120a() { +#if (PTX >= 86) && \ + (__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL || \ + __CUDA_ARCH_FEAT_SM120_ALL) + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e2m3x2_rn_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e2m3x2_rn_relu_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e3m2x2_rn_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e3m2x2_rn_relu_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532) + // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532) + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532) + __nvvm_e2m3x2_to_f16x2_rn(0x4C4C); + + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504) + // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504) + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504) + __nvvm_e2m3x2_to_f16x2_rn_relu(0x4848); + + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504) + // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504) + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504) + __nvvm_e3m2x2_to_f16x2_rn(0x4848); + + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532) + // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532) + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532) + __nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rz(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rz_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rp(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rp_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rz({(__bf16)0.1f, (__bf16)0.1f}); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rz_satfinite({(__bf16)0.1f, (__bf16)0.1f}); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rp({(__bf16)0.1f, (__bf16)0.1f}); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rp_satfinite({(__bf16)0.1f, (__bf16)0.1f}); + + // CHECK_PTX86_SM100a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532) + // CHECK_PTX86_SM101a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532) + // CHECK_PTX86_SM120a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532) + __nvvm_ue8m0x2_to_bf16x2(0x4C4C); + #endif // CHECK: ret void } diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 4aeb1d8a2779e..7af051d22f9b7 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1628,6 +1628,37 @@ let TargetPrefix = "nvvm" in { Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; def int_nvvm_e5m2x2_to_f16x2_rn_relu : ClangBuiltin<"__nvvm_e5m2x2_to_f16x2_rn_relu">, Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; + + class CVT_FF_TO_I16 : ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + + class CVT_I16_TO_F16X2 + : ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; + + class CVT_BF16X2_TO_I16 : ClangBuiltin, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>; + + // FP6 conversions. + foreach type = ["e2m3x2", "e3m2x2"] in { + foreach relu = ["", "_relu"] in { + defvar suffix = !strconcat("_rn", relu); + def int_nvvm_ff_to_ # type # suffix # _satfinite : CVT_FF_TO_I16; + def int_nvvm_ # type # _to_f16x2 # suffix : CVT_I16_TO_F16X2; + } + } + + // UE8M0x2 conversions. + foreach rmode = ["_rz", "_rp"] in { + foreach satmode = ["", "_satfinite"] in { + defvar suffix = !strconcat(rmode, satmode); + def int_nvvm_ff_to_ue8m0x2 # suffix : CVT_FF_TO_I16<"ue8m0x2", suffix>; + def int_nvvm_bf16x2_to_ue8m0x2 # suffix : CVT_BF16X2_TO_I16<"ue8m0x2", suffix>; + } + } + + def int_nvvm_ue8m0x2_to_bf16x2 : ClangBuiltin<"__nvvm_ue8m0x2_to_bf16x2">, + Intrinsic<[llvm_v2bf16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; // FNS diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 16b489afddf5c..d97c8343f1544 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -703,6 +703,41 @@ let hasSideEffects = false in { defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, hasSM<100>]>; defm CVT_to_tf32_rn_relu_satf : CVT_TO_TF32<"rn.relu.satfinite", [hasPTX<86>, hasSM<100>]>; defm CVT_to_tf32_rz_relu_satf : CVT_TO_TF32<"rz.relu.satfinite", [hasPTX<86>, hasSM<100>]>; + + // FP6 conversions. + foreach type = ["e2m3x2", "e3m2x2"] in { + def CVT_ # type # _f32_sf : NVPTXInst<(outs Int16Regs:$dst), + (ins Float32Regs:$src1, + Float32Regs:$src2, CvtMode:$mode), + "cvt${mode:base}.satfinite${mode:relu}." + # type # ".f32 \t$dst, $src1, $src2;", []>; + def CVT_f16x2_ # type : NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$src, CvtMode:$mode), + "cvt${mode:base}${mode:relu}.f16x2." + # type # " \t$dst, $src;", []>; + } + + // UE8M0x2 conversions. + class CVT_f32_to_ue8m0x2 : + NVPTXInst<(outs Int16Regs:$dst), + (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode), + "cvt${mode:base}" # sat # ".ue8m0x2.f32 \t$dst, $src1, $src2;", []>; + + class CVT_bf16x2_to_ue8m0x2 : + NVPTXInst<(outs Int16Regs:$dst), + (ins Int32Regs:$src, CvtMode:$mode), + "cvt${mode:base}" # sat # ".ue8m0x2.bf16x2 \t$dst, $src;", []>; + + def CVT_ue8m0x2_f32 : CVT_f32_to_ue8m0x2; + def CVT_ue8m0x2_f32_sf : CVT_f32_to_ue8m0x2<".satfinite">; + def CVT_ue8m0x2_bf16x2 : CVT_bf16x2_to_ue8m0x2; + def CVT_ue8m0x2_bf16x2_sf : CVT_bf16x2_to_ue8m0x2<".satfinite">; + + def CVT_bf16x2_ue8m0x2 : + NVPTXInst<(outs Int32Regs:$dst), + (ins Int16Regs:$src), + "cvt.rn.bf16x2.ue8m0x2 \t$dst, $src;", []>; + } def fpround_oneuse : OneUse1; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 8528ff702f236..b5d3c9a05ec34 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1944,6 +1944,62 @@ def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn Int16Regs:$a), def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu Int16Regs:$a), (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>; +def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b), + (CVT_e2m3x2_f32_sf $a, $b, CvtRN)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu_satfinite f32:$a, f32:$b), + (CVT_e2m3x2_f32_sf $a, $b, CvtRN_RELU)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_ff_to_e3m2x2_rn_satfinite f32:$a, f32:$b), + (CVT_e3m2x2_f32_sf $a, $b, CvtRN)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_ff_to_e3m2x2_rn_relu_satfinite f32:$a, f32:$b), + (CVT_e3m2x2_f32_sf $a, $b, CvtRN_RELU)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + +def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn i16:$a), + (CVT_f16x2_e2m3x2 $a, CvtRN)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn_relu i16:$a), + (CVT_f16x2_e2m3x2 $a, CvtRN_RELU)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn i16:$a), + (CVT_f16x2_e3m2x2 $a, CvtRN)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn_relu i16:$a), + (CVT_f16x2_e3m2x2 $a, CvtRN_RELU)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + +def : Pat<(int_nvvm_ff_to_ue8m0x2_rz f32:$a, f32:$b), + (CVT_ue8m0x2_f32 $a, $b, CvtRZ)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_ff_to_ue8m0x2_rz_satfinite f32:$a, f32:$b), + (CVT_ue8m0x2_f32_sf $a, $b, CvtRZ)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_ff_to_ue8m0x2_rp f32:$a, f32:$b), + (CVT_ue8m0x2_f32 $a, $b, CvtRP)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_ff_to_ue8m0x2_rp_satfinite f32:$a, f32:$b), + (CVT_ue8m0x2_f32_sf $a, $b, CvtRP)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + +def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz Int32Regs:$a), + (CVT_ue8m0x2_bf16x2 $a, CvtRZ)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz_satfinite Int32Regs:$a), + (CVT_ue8m0x2_bf16x2_sf $a, CvtRZ)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp Int32Regs:$a), + (CVT_ue8m0x2_bf16x2 $a, CvtRP)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp_satfinite Int32Regs:$a), + (CVT_ue8m0x2_bf16x2_sf $a, CvtRP)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + +def : Pat<(int_nvvm_ue8m0x2_to_bf16x2 i16:$a), + (CVT_bf16x2_ue8m0x2 $a)>, + Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; + // // FNS // diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll new file mode 100644 index 0000000000000..f0dd5f084026b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll @@ -0,0 +1,290 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | FileCheck %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 | %ptxas-verify -arch=sm_120a %} + +define i16 @cvt_rn_sf_e2m3x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_sf_e2m3x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_sf_e2m3x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_sf_e2m3x2_f32_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.e2m3x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rn_relu_sf_e2m3x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_relu_sf_e2m3x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_sf_e2m3x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_relu_sf_e2m3x2_f32_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.relu.e2m3x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rn_sf_e3m2x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_sf_e3m2x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_sf_e3m2x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_sf_e3m2x2_f32_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.e3m2x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rn_relu_sf_e3m2x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_relu_sf_e3m2x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_sf_e3m2x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_relu_sf_e3m2x2_f32_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.relu.e3m2x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float %f1, float %f2) + ret i16 %val +} + +define <2 x half> @cvt_rn_f16x2_e2m3x2(i16 %in) { +; CHECK-LABEL: cvt_rn_f16x2_e2m3x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_f16x2_e2m3x2_param_0]; +; CHECK-NEXT: cvt.rn.f16x2.e2m3x2 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 %in) + ret <2 x half> %val +} + +define <2 x half> @cvt_rn_relu_f16x2_e2m3x2_relu(i16 %in) { +; CHECK-LABEL: cvt_rn_relu_f16x2_e2m3x2_relu( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_relu_f16x2_e2m3x2_relu_param_0]; +; CHECK-NEXT: cvt.rn.relu.f16x2.e2m3x2 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 %in) + ret <2 x half> %val +} + +define <2 x half> @cvt_rn_f16x2_e3m2x2(i16 %in) { +; CHECK-LABEL: cvt_rn_f16x2_e3m2x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_f16x2_e3m2x2_param_0]; +; CHECK-NEXT: cvt.rn.f16x2.e3m2x2 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 %in) + ret <2 x half> %val +} + +define <2 x half> @cvt_rn_relu_f16x2_e3m2x2(i16 %in) { +; CHECK-LABEL: cvt_rn_relu_f16x2_e3m2x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_relu_f16x2_e3m2x2_param_0]; +; CHECK-NEXT: cvt.rn.relu.f16x2.e3m2x2 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 %in) + ret <2 x half> %val +} + +define i16 @cvt_rz_ue8m0x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_ue8m0x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_ue8m0x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_ue8m0x2_f32_param_1]; +; CHECK-NEXT: cvt.rz.ue8m0x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rz_sf_ue8m0x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_sf_ue8m0x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_sf_ue8m0x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_sf_ue8m0x2_f32_param_1]; +; CHECK-NEXT: cvt.rz.satfinite.ue8m0x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rp_ue8m0x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rp_ue8m0x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rp_ue8m0x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rp_ue8m0x2_f32_param_1]; +; CHECK-NEXT: cvt.rp.ue8m0x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rp_sf_ue8m0x2_f32(float %f1, float %f2) { +; CHECK-LABEL: cvt_rp_sf_ue8m0x2_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rp_sf_ue8m0x2_f32_param_0]; +; CHECK-NEXT: ld.param.f32 %f2, [cvt_rp_sf_ue8m0x2_f32_param_1]; +; CHECK-NEXT: cvt.rp.satfinite.ue8m0x2.f32 %rs1, %f1, %f2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float %f1, float %f2) + ret i16 %val +} + +define i16 @cvt_rz_ue8m0x2_bf16x2(<2 x bfloat> %in) { +; CHECK-LABEL: cvt_rz_ue8m0x2_bf16x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_ue8m0x2_bf16x2_param_0]; +; CHECK-NEXT: cvt.rz.ue8m0x2.bf16x2 %rs1, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> %in) + ret i16 %val +} + +define i16 @cvt_rz_sf_ue8m0x2_bf16x2(<2 x bfloat> %in) { +; CHECK-LABEL: cvt_rz_sf_ue8m0x2_bf16x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_sf_ue8m0x2_bf16x2_param_0]; +; CHECK-NEXT: cvt.rz.satfinite.ue8m0x2.bf16x2 %rs1, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> %in) + ret i16 %val +} + +define i16 @cvt_rp_ue8m0x2_bf16x2(<2 x bfloat> %in) { +; CHECK-LABEL: cvt_rp_ue8m0x2_bf16x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rp_ue8m0x2_bf16x2_param_0]; +; CHECK-NEXT: cvt.rp.ue8m0x2.bf16x2 %rs1, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> %in) + ret i16 %val +} + +define i16 @cvt_rp_sf_ue8m0x2_bf16x2(<2 x bfloat> %in) { +; CHECK-LABEL: cvt_rp_sf_ue8m0x2_bf16x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rp_sf_ue8m0x2_bf16x2_param_0]; +; CHECK-NEXT: cvt.rp.satfinite.ue8m0x2.bf16x2 %rs1, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %val = call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> %in) + ret i16 %val +} + +define <2 x bfloat> @cvt_bf16x2_ue8m0x2(i16 %in) { +; CHECK-LABEL: cvt_bf16x2_ue8m0x2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u16 %rs1, [cvt_bf16x2_ue8m0x2_param_0]; +; CHECK-NEXT: cvt.rn.bf16x2.ue8m0x2 %r1, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 %in) + ret <2 x bfloat> %val +}