2525// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
2626// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
2727// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
28- // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \
28+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 -DPTX=81 \
2929// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
3030// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
31+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_90 -target-feature +ptx78 -DPTX=78 \
32+ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
33+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX78_SM90 %s
34+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100 -target-feature +ptx86 -DPTX=86 \
35+ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
36+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100 %s
37+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx86 -DPTX=86 \
38+ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
39+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100a %s
40+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_101a -target-feature +ptx86 -DPTX=86 \
41+ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
42+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM101a %s
43+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \
44+ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
45+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
3146// ### The last run to check with the highest SM and PTX version available
3247// ### to make sure target builtins are still accepted.
33- // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \
48+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \
3449// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
35- // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
50+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
3651
3752#define __device__ __attribute__((device))
3853#define __global__ __attribute__((global))
@@ -995,7 +1010,7 @@ __device__ void nvvm_cvt_sm80() {
9951010
9961011// CHECK-LABEL: nvvm_cvt_sm89
9971012__device__ void nvvm_cvt_sm89 () {
998- #if __CUDA_ARCH__ >= 890
1013+ #if ( PTX >= 81 ) && ( __CUDA_ARCH__ >= 890 )
9991014 // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.000000e+00, float 1.000000e+00)
10001015 __nvvm_ff_to_e4m3x2_rn (1.0f , 1.0f );
10011016 // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
@@ -1022,6 +1037,133 @@ __device__ void nvvm_cvt_sm89() {
10221037 __nvvm_e5m2x2_to_f16x2_rn (0x4c4c );
10231038 // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532)
10241039 __nvvm_e5m2x2_to_f16x2_rn_relu (0x4c4c );
1040+
1041+ // CHECK_PTX81_SM89: call i32 @llvm.nvvm.f2tf32.rna.satfinite(float 1.000000e+00)
1042+ __nvvm_f2tf32_rna_satfinite (1.0f );
1043+ #endif
1044+ // CHECK: ret void
1045+ }
1046+
1047+ // CHECK-LABEL: nvvm_cvt_sm90
1048+ __device__ void nvvm_cvt_sm90 () {
1049+ #if (PTX >= 78 ) && (__CUDA_ARCH__ >= 900 )
1050+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn(float 1.000000e+00)
1051+ __nvvm_f2tf32_rn (1.0f );
1052+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn.relu(float 1.000000e+00)
1053+ __nvvm_f2tf32_rn_relu (1.0f );
1054+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz(float 1.000000e+00)
1055+ __nvvm_f2tf32_rz (1.0f );
1056+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz.relu(float 1.000000e+00)
1057+ __nvvm_f2tf32_rz_relu (1.0f );
1058+ #endif
1059+ // CHECK: ret void
1060+ }
1061+
1062+ // CHECK-LABEL: nvvm_cvt_sm100
1063+ __device__ void nvvm_cvt_sm100 () {
1064+ #if (PTX >= 86 ) && (__CUDA_ARCH__ >= 1000 )
1065+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.satfinite(float 1.000000e+00)
1066+ __nvvm_f2tf32_rn_satfinite (1.0f );
1067+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float 1.000000e+00)
1068+ __nvvm_f2tf32_rn_relu_satfinite (1.0f );
1069+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.satfinite(float 1.000000e+00)
1070+ __nvvm_f2tf32_rz_satfinite (1.0f );
1071+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float 1.000000e+00)
1072+ __nvvm_f2tf32_rz_relu_satfinite (1.0f );
1073+ #endif
1074+ // CHECK: ret void
1075+ }
1076+
1077+ // CHECK-LABEL: nvvm_cvt_sm100a_sm101a_sm120a
1078+ __device__ void nvvm_cvt_sm100a_sm101a_sm120a () {
1079+ #if (PTX >= 86 ) && \
1080+ (__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL || \
1081+ __CUDA_ARCH_FEAT_SM120_ALL )
1082+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
1083+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
1084+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
1085+ __nvvm_ff_to_e2m3x2_rn_satfinite (1.0f , 1.0f );
1086+
1087+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
1088+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
1089+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
1090+ __nvvm_ff_to_e2m3x2_rn_relu_satfinite (1.0f , 1.0f );
1091+
1092+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
1093+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
1094+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
1095+ __nvvm_ff_to_e3m2x2_rn_satfinite (1.0f , 1.0f );
1096+
1097+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
1098+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
1099+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
1100+ __nvvm_ff_to_e3m2x2_rn_relu_satfinite (1.0f , 1.0f );
1101+
1102+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1103+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1104+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1105+ __nvvm_e2m3x2_to_f16x2_rn (0x4C4C );
1106+
1107+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1108+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1109+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1110+ __nvvm_e2m3x2_to_f16x2_rn_relu (0x4848 );
1111+
1112+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1113+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1114+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1115+ __nvvm_e3m2x2_to_f16x2_rn (0x4848 );
1116+
1117+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1118+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1119+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1120+ __nvvm_e3m2x2_to_f16x2_rn_relu (0x4C4C );
1121+
1122+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1123+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1124+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1125+ __nvvm_ff_to_ue8m0x2_rz (1.0f , 1.0f );
1126+
1127+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1128+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1129+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1130+ __nvvm_ff_to_ue8m0x2_rz_satfinite (1.0f , 1.0f );
1131+
1132+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1133+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1134+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1135+ __nvvm_ff_to_ue8m0x2_rp (1.0f , 1.0f );
1136+
1137+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1138+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1139+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1140+ __nvvm_ff_to_ue8m0x2_rp_satfinite (1.0f , 1.0f );
1141+
1142+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1143+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1144+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1145+ __nvvm_bf16x2_to_ue8m0x2_rz ({(__bf16 )0.1f , (__bf16 )0.1f });
1146+
1147+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1148+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1149+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1150+ __nvvm_bf16x2_to_ue8m0x2_rz_satfinite ({(__bf16 )0.1f , (__bf16 )0.1f });
1151+
1152+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1153+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1154+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1155+ __nvvm_bf16x2_to_ue8m0x2_rp ({(__bf16 )0.1f , (__bf16 )0.1f });
1156+
1157+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1158+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1159+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1160+ __nvvm_bf16x2_to_ue8m0x2_rp_satfinite ({(__bf16 )0.1f , (__bf16 )0.1f });
1161+
1162+ // CHECK_PTX86_SM100a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1163+ // CHECK_PTX86_SM101a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1164+ // CHECK_PTX86_SM120a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1165+ __nvvm_ue8m0x2_to_bf16x2 (0x4C4C );
1166+
10251167#endif
10261168 // CHECK: ret void
10271169}
0 commit comments