Skip to content

Commit 184b9a4

Browse files
committed
[NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants
This change adds NVVM intrinsics and clang builtins for the cvt instruction variants of types .e2m3x2, .e3m2x2, and .ue8m0x2 introduced in PTX 8.6 for `sm_100a`, `sm_101a`, and `sm_120a`. Tests are added in `NVPTX/convert-sm1XXa.ll` and `clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0. PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
1 parent 3295970 commit 184b9a4

File tree

6 files changed

+650
-4
lines changed

6 files changed

+650
-4
lines changed

clang/include/clang/Basic/BuiltinsNVPTX.td

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,15 @@ def __nvvm_f2bf16_rz : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
580580
def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
581581

582582
def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>;
583+
def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_89, PTX81>;
584+
def __nvvm_f2tf32_rn : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
585+
def __nvvm_f2tf32_rn_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
586+
def __nvvm_f2tf32_rn_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>;
587+
def __nvvm_f2tf32_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>;
588+
def __nvvm_f2tf32_rz : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
589+
def __nvvm_f2tf32_rz_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
590+
def __nvvm_f2tf32_rz_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>;
591+
def __nvvm_f2tf32_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>;
583592

584593
def __nvvm_ff_to_e4m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", SM_89, PTX81>;
585594
def __nvvm_ff_to_e4m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", SM_89, PTX81>;
@@ -596,6 +605,28 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
596605
def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>;
597606
def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>;
598607

608+
def __nvvm_ff_to_e2m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
609+
def __nvvm_ff_to_e2m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
610+
def __nvvm_ff_to_e3m2x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
611+
def __nvvm_ff_to_e3m2x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
612+
613+
def __nvvm_e2m3x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
614+
def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
615+
def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
616+
def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
617+
618+
def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
619+
def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
620+
def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
621+
def __nvvm_ff_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
622+
623+
def __nvvm_bf16x2_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
624+
def __nvvm_bf16x2_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
625+
def __nvvm_bf16x2_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
626+
def __nvvm_bf16x2_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
627+
628+
def __nvvm_ue8m0x2_to_bf16x2 : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
629+
599630
// FNS
600631
let Attributes = [NoThrow] in {
601632
def __nvvm_fns : NVPTXBuiltinPTX<"unsigned int(unsigned int, unsigned int, int)", PTX60>;

clang/test/CodeGen/builtins-nvptx.c

Lines changed: 187 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,29 @@
2525
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
2626
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
2727
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
28-
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \
28+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 -DPTX=81\
2929
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
3030
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
31+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_90 -target-feature +ptx78 -DPTX=78 \
32+
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
33+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX78_SM90 %s
34+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100 -target-feature +ptx86 -DPTX=86 \
35+
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
36+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100 %s
37+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx86 -DPTX=86 \
38+
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
39+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100a %s
40+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_101a -target-feature +ptx86 -DPTX=86 \
41+
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
42+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM101a %s
43+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \
44+
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
45+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
3146
// ### The last run to check with the highest SM and PTX version available
3247
// ### to make sure target builtins are still accepted.
33-
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \
48+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \
3449
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
35-
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
50+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
3651

3752
#define __device__ __attribute__((device))
3853
#define __global__ __attribute__((global))
@@ -994,7 +1009,7 @@ __device__ void nvvm_cvt_sm80() {
9941009

9951010
// CHECK-LABEL: nvvm_cvt_sm89
9961011
__device__ void nvvm_cvt_sm89() {
997-
#if __CUDA_ARCH__ >= 890
1012+
#if (PTX >= 81) && (__CUDA_ARCH__ >= 890)
9981013
// CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.000000e+00, float 1.000000e+00)
9991014
__nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f);
10001015
// CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
@@ -1021,6 +1036,174 @@ __device__ void nvvm_cvt_sm89() {
10211036
__nvvm_e5m2x2_to_f16x2_rn(0x4c4c);
10221037
// CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532)
10231038
__nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c);
1039+
1040+
// CHECK_PTX81_SM89: call i32 @llvm.nvvm.f2tf32.rna.satfinite(float 1.000000e+00)
1041+
__nvvm_f2tf32_rna_satfinite(1.0f);
1042+
#endif
1043+
// CHECK: ret void
1044+
}
1045+
1046+
// CHECK-LABEL: nvvm_cvt_sm90
1047+
__device__ void nvvm_cvt_sm90() {
1048+
#if (PTX >= 78) && (__CUDA_ARCH__ >= 900)
1049+
// CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn(float 1.000000e+00)
1050+
__nvvm_f2tf32_rn(1.0f);
1051+
// CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn.relu(float 1.000000e+00)
1052+
__nvvm_f2tf32_rn_relu(1.0f);
1053+
// CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz(float 1.000000e+00)
1054+
__nvvm_f2tf32_rz(1.0f);
1055+
// CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz.relu(float 1.000000e+00)
1056+
__nvvm_f2tf32_rz_relu(1.0f);
1057+
#endif
1058+
// CHECK: ret void
1059+
}
1060+
1061+
// CHECK-LABEL: nvvm_cvt_sm100
1062+
__device__ void nvvm_cvt_sm100() {
1063+
#if (PTX >= 86) && (__CUDA_ARCH__ >= 1000)
1064+
// CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.satfinite(float 1.000000e+00)
1065+
__nvvm_f2tf32_rn_satfinite(1.0f);
1066+
// CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float 1.000000e+00)
1067+
__nvvm_f2tf32_rn_relu_satfinite(1.0f);
1068+
// CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.satfinite(float 1.000000e+00)
1069+
__nvvm_f2tf32_rz_satfinite(1.0f);
1070+
// CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float 1.000000e+00)
1071+
__nvvm_f2tf32_rz_relu_satfinite(1.0f);
1072+
#endif
1073+
// CHECK: ret void
1074+
}
1075+
1076+
// CHECK-LABEL: nvvm_cvt_sm100a
1077+
__device__ void nvvm_cvt_sm100a() {
1078+
#if (PTX >= 86) && __CUDA_ARCH_FEAT_SM100_ALL
1079+
// CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00)
1080+
__nvvm_ff_to_e2m3x2_rn(1.0f, 1.0f);
1081+
// CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1082+
__nvvm_ff_to_e2m3x2_rn_relu(1.0f, 1.0f);
1083+
// CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00)
1084+
__nvvm_ff_to_e3m2x2_rn(1.0f, 1.0f);
1085+
// CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1086+
__nvvm_ff_to_e3m2x2_rn_relu(1.0f, 1.0f);
1087+
1088+
// CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1089+
__nvvm_e2m3x2_to_f16x2_rn(0x4C4C);
1090+
// CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1091+
__nvvm_e2m3x2_to_f16x2_rn_relu(0x4848);
1092+
// CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1093+
__nvvm_e3m2x2_to_f16x2_rn(0x4848);
1094+
// CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1095+
__nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C);
1096+
1097+
// CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1098+
__nvvm_ff_to_ue8m0x2_rz(1.0f, 1.0f);
1099+
// CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1100+
__nvvm_ff_to_ue8m0x2_rz_satfinite(1.0f, 1.0f);
1101+
// CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1102+
__nvvm_ff_to_ue8m0x2_rp(1.0f, 1.0f);
1103+
// CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1104+
__nvvm_ff_to_ue8m0x2_rp_satfinite(1.0f, 1.0f);
1105+
1106+
// CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1107+
__nvvm_bf16x2_to_ue8m0x2_rz({(__bf16)0.1f, (__bf16)0.1f});
1108+
// CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1109+
__nvvm_bf16x2_to_ue8m0x2_rz_satfinite({(__bf16)0.1f, (__bf16)0.1f});
1110+
// CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1111+
__nvvm_bf16x2_to_ue8m0x2_rp({(__bf16)0.1f, (__bf16)0.1f});
1112+
// CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1113+
__nvvm_bf16x2_to_ue8m0x2_rp_satfinite({(__bf16)0.1f, (__bf16)0.1f});
1114+
1115+
// CHECK_PTX86_SM100a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1116+
__nvvm_ue8m0x2_to_bf16x2(0x4C4C);
1117+
#endif
1118+
// CHECK: ret void
1119+
}
1120+
1121+
// CHECK-LABEL: nvvm_cvt_sm101a
1122+
__device__ void nvvm_cvt_sm101a() {
1123+
#if (PTX >= 86) && __CUDA_ARCH_FEAT_SM101_ALL
1124+
// CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00)
1125+
__nvvm_ff_to_e2m3x2_rn(1.0f, 1.0f);
1126+
// CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1127+
__nvvm_ff_to_e2m3x2_rn_relu(1.0f, 1.0f);
1128+
// CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00)
1129+
__nvvm_ff_to_e3m2x2_rn(1.0f, 1.0f);
1130+
// CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1131+
__nvvm_ff_to_e3m2x2_rn_relu(1.0f, 1.0f);
1132+
1133+
// CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1134+
__nvvm_e2m3x2_to_f16x2_rn(0x4C4C);
1135+
// CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1136+
__nvvm_e2m3x2_to_f16x2_rn_relu(0x4848);
1137+
// CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1138+
__nvvm_e3m2x2_to_f16x2_rn(0x4848);
1139+
// CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1140+
__nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C);
1141+
1142+
// CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1143+
__nvvm_ff_to_ue8m0x2_rz(1.0f, 1.0f);
1144+
// CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1145+
__nvvm_ff_to_ue8m0x2_rz_satfinite(1.0f, 1.0f);
1146+
// CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1147+
__nvvm_ff_to_ue8m0x2_rp(1.0f, 1.0f);
1148+
// CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1149+
__nvvm_ff_to_ue8m0x2_rp_satfinite(1.0f, 1.0f);
1150+
1151+
// CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1152+
__nvvm_bf16x2_to_ue8m0x2_rz({(__bf16)0.1f, (__bf16)0.1f});
1153+
// CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1154+
__nvvm_bf16x2_to_ue8m0x2_rz_satfinite({(__bf16)0.1f, (__bf16)0.1f});
1155+
// CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1156+
__nvvm_bf16x2_to_ue8m0x2_rp({(__bf16)0.1f, (__bf16)0.1f});
1157+
// CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1158+
__nvvm_bf16x2_to_ue8m0x2_rp_satfinite({(__bf16)0.1f, (__bf16)0.1f});
1159+
1160+
// CHECK_PTX86_SM101a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1161+
__nvvm_ue8m0x2_to_bf16x2(0x4C4C);
1162+
#endif
1163+
// CHECK: ret void
1164+
}
1165+
1166+
// CHECK-LABEL: nvvm_cvt_sm120a
1167+
__device__ void nvvm_cvt_sm120a() {
1168+
#if (PTX >= 86) && __CUDA_ARCH_FEAT_SM120_ALL
1169+
// CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00)
1170+
__nvvm_ff_to_e2m3x2_rn(1.0f, 1.0f);
1171+
// CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1172+
__nvvm_ff_to_e2m3x2_rn_relu(1.0f, 1.0f);
1173+
// CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00)
1174+
__nvvm_ff_to_e3m2x2_rn(1.0f, 1.0f);
1175+
// CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1176+
__nvvm_ff_to_e3m2x2_rn_relu(1.0f, 1.0f);
1177+
1178+
// CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1179+
__nvvm_e2m3x2_to_f16x2_rn(0x4C4C);
1180+
// CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1181+
__nvvm_e2m3x2_to_f16x2_rn_relu(0x4848);
1182+
// CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1183+
__nvvm_e3m2x2_to_f16x2_rn(0x4848);
1184+
// CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1185+
__nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C);
1186+
1187+
// CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1188+
__nvvm_ff_to_ue8m0x2_rz(1.0f, 1.0f);
1189+
// CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1190+
__nvvm_ff_to_ue8m0x2_rz_satfinite(1.0f, 1.0f);
1191+
// CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1192+
__nvvm_ff_to_ue8m0x2_rp(1.0f, 1.0f);
1193+
// CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1194+
__nvvm_ff_to_ue8m0x2_rp_satfinite(1.0f, 1.0f);
1195+
1196+
// CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1197+
__nvvm_bf16x2_to_ue8m0x2_rz({(__bf16)0.1f, (__bf16)0.1f});
1198+
// CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1199+
__nvvm_bf16x2_to_ue8m0x2_rz_satfinite({(__bf16)0.1f, (__bf16)0.1f});
1200+
// CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1201+
__nvvm_bf16x2_to_ue8m0x2_rp({(__bf16)0.1f, (__bf16)0.1f});
1202+
// CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1203+
__nvvm_bf16x2_to_ue8m0x2_rp_satfinite({(__bf16)0.1f, (__bf16)0.1f});
1204+
1205+
// CHECK_PTX86_SM120a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1206+
__nvvm_ue8m0x2_to_bf16x2(0x4C4C);
10241207
#endif
10251208
// CHECK: ret void
10261209
}

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1548,6 +1548,45 @@ let TargetPrefix = "nvvm" in {
15481548
Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
15491549
def int_nvvm_e5m2x2_to_f16x2_rn_relu : ClangBuiltin<"__nvvm_e5m2x2_to_f16x2_rn_relu">,
15501550
Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
1551+
1552+
def int_nvvm_ff_to_e2m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e2m3x2_rn">,
1553+
Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
1554+
def int_nvvm_ff_to_e2m3x2_rn_relu : ClangBuiltin<"__nvvm_ff_to_e2m3x2_rn_relu">,
1555+
Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
1556+
def int_nvvm_ff_to_e3m2x2_rn : ClangBuiltin<"__nvvm_ff_to_e3m2x2_rn">,
1557+
Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
1558+
def int_nvvm_ff_to_e3m2x2_rn_relu : ClangBuiltin<"__nvvm_ff_to_e3m2x2_rn_relu">,
1559+
Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
1560+
1561+
def int_nvvm_e2m3x2_to_f16x2_rn : ClangBuiltin<"__nvvm_e2m3x2_to_f16x2_rn">,
1562+
Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
1563+
def int_nvvm_e2m3x2_to_f16x2_rn_relu : ClangBuiltin<"__nvvm_e2m3x2_to_f16x2_rn_relu">,
1564+
Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
1565+
def int_nvvm_e3m2x2_to_f16x2_rn : ClangBuiltin<"__nvvm_e3m2x2_to_f16x2_rn">,
1566+
Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
1567+
def int_nvvm_e3m2x2_to_f16x2_rn_relu : ClangBuiltin<"__nvvm_e3m2x2_to_f16x2_rn_relu">,
1568+
Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
1569+
1570+
def int_nvvm_ff_to_ue8m0x2_rz : ClangBuiltin<"__nvvm_ff_to_ue8m0x2_rz">,
1571+
Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
1572+
def int_nvvm_ff_to_ue8m0x2_rz_satfinite : ClangBuiltin<"__nvvm_ff_to_ue8m0x2_rz_satfinite">,
1573+
Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
1574+
def int_nvvm_ff_to_ue8m0x2_rp : ClangBuiltin<"__nvvm_ff_to_ue8m0x2_rp">,
1575+
Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
1576+
def int_nvvm_ff_to_ue8m0x2_rp_satfinite : ClangBuiltin<"__nvvm_ff_to_ue8m0x2_rp_satfinite">,
1577+
Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
1578+
1579+
def int_nvvm_bf16x2_to_ue8m0x2_rz : ClangBuiltin<"__nvvm_bf16x2_to_ue8m0x2_rz">,
1580+
Intrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>;
1581+
def int_nvvm_bf16x2_to_ue8m0x2_rz_satfinite : ClangBuiltin<"__nvvm_bf16x2_to_ue8m0x2_rz_satfinite">,
1582+
Intrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>;
1583+
def int_nvvm_bf16x2_to_ue8m0x2_rp : ClangBuiltin<"__nvvm_bf16x2_to_ue8m0x2_rp">,
1584+
Intrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>;
1585+
def int_nvvm_bf16x2_to_ue8m0x2_rp_satfinite : ClangBuiltin<"__nvvm_bf16x2_to_ue8m0x2_rp_satfinite">,
1586+
Intrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>;
1587+
1588+
def int_nvvm_ue8m0x2_to_bf16x2 : ClangBuiltin<"__nvvm_ue8m0x2_to_bf16x2">,
1589+
Intrinsic<[llvm_v2bf16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
15511590

15521591
// FNS
15531592

0 commit comments

Comments
 (0)