2525// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
2626// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
2727// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
28- // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \
28+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 -DPTX=81 \
2929// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
3030// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
31+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_90 -target-feature +ptx78 -DPTX=78 \
32+ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
33+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX78_SM90 %s
34+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100 -target-feature +ptx86 -DPTX=86 \
35+ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
36+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100 %s
37+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx86 -DPTX=86 \
38+ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
39+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100a %s
40+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_101a -target-feature +ptx86 -DPTX=86 \
41+ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
42+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM101a %s
43+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \
44+ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
45+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
3146// ### The last run to check with the highest SM and PTX version available
3247// ### to make sure target builtins are still accepted.
33- // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \
48+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \
3449// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
35- // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
50+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
3651
3752#define __device__ __attribute__((device))
3853#define __global__ __attribute__((global))
@@ -994,7 +1009,7 @@ __device__ void nvvm_cvt_sm80() {
9941009
9951010// CHECK-LABEL: nvvm_cvt_sm89
9961011__device__ void nvvm_cvt_sm89 () {
997- #if __CUDA_ARCH__ >= 890
1012+ #if ( PTX >= 81 ) && ( __CUDA_ARCH__ >= 890 )
9981013 // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.000000e+00, float 1.000000e+00)
9991014 __nvvm_ff_to_e4m3x2_rn (1.0f , 1.0f );
10001015 // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
@@ -1021,6 +1036,133 @@ __device__ void nvvm_cvt_sm89() {
10211036 __nvvm_e5m2x2_to_f16x2_rn (0x4c4c );
10221037 // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532)
10231038 __nvvm_e5m2x2_to_f16x2_rn_relu (0x4c4c );
1039+
1040+ // CHECK_PTX81_SM89: call i32 @llvm.nvvm.f2tf32.rna.satfinite(float 1.000000e+00)
1041+ __nvvm_f2tf32_rna_satfinite (1.0f );
1042+ #endif
1043+ // CHECK: ret void
1044+ }
1045+
1046+ // CHECK-LABEL: nvvm_cvt_sm90
1047+ __device__ void nvvm_cvt_sm90 () {
1048+ #if (PTX >= 78 ) && (__CUDA_ARCH__ >= 900 )
1049+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn(float 1.000000e+00)
1050+ __nvvm_f2tf32_rn (1.0f );
1051+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn.relu(float 1.000000e+00)
1052+ __nvvm_f2tf32_rn_relu (1.0f );
1053+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz(float 1.000000e+00)
1054+ __nvvm_f2tf32_rz (1.0f );
1055+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz.relu(float 1.000000e+00)
1056+ __nvvm_f2tf32_rz_relu (1.0f );
1057+ #endif
1058+ // CHECK: ret void
1059+ }
1060+
1061+ // CHECK-LABEL: nvvm_cvt_sm100
1062+ __device__ void nvvm_cvt_sm100 () {
1063+ #if (PTX >= 86 ) && (__CUDA_ARCH__ >= 1000 )
1064+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.satfinite(float 1.000000e+00)
1065+ __nvvm_f2tf32_rn_satfinite (1.0f );
1066+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float 1.000000e+00)
1067+ __nvvm_f2tf32_rn_relu_satfinite (1.0f );
1068+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.satfinite(float 1.000000e+00)
1069+ __nvvm_f2tf32_rz_satfinite (1.0f );
1070+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float 1.000000e+00)
1071+ __nvvm_f2tf32_rz_relu_satfinite (1.0f );
1072+ #endif
1073+ // CHECK: ret void
1074+ }
1075+
1076+ // CHECK-LABEL: nvvm_cvt_sm100a_sm101a_sm120a
1077+ __device__ void nvvm_cvt_sm100a_sm101a_sm120a () {
1078+ #if (PTX >= 86 ) && \
1079+ (__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL || \
1080+ __CUDA_ARCH_FEAT_SM120_ALL )
1081+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00)
1082+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00)
1083+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00)
1084+ __nvvm_ff_to_e2m3x2_rn (1.0f , 1.0f );
1085+
1086+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1087+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1088+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1089+ __nvvm_ff_to_e2m3x2_rn_relu (1.0f , 1.0f );
1090+
1091+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00)
1092+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00)
1093+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00)
1094+ __nvvm_ff_to_e3m2x2_rn (1.0f , 1.0f );
1095+
1096+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1097+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1098+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1099+ __nvvm_ff_to_e3m2x2_rn_relu (1.0f , 1.0f );
1100+
1101+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1102+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1103+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1104+ __nvvm_e2m3x2_to_f16x2_rn (0x4C4C );
1105+
1106+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1107+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1108+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1109+ __nvvm_e2m3x2_to_f16x2_rn_relu (0x4848 );
1110+
1111+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1112+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1113+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1114+ __nvvm_e3m2x2_to_f16x2_rn (0x4848 );
1115+
1116+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1117+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1118+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1119+ __nvvm_e3m2x2_to_f16x2_rn_relu (0x4C4C );
1120+
1121+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1122+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1123+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1124+ __nvvm_ff_to_ue8m0x2_rz (1.0f , 1.0f );
1125+
1126+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1127+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1128+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1129+ __nvvm_ff_to_ue8m0x2_rz_satfinite (1.0f , 1.0f );
1130+
1131+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1132+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1133+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1134+ __nvvm_ff_to_ue8m0x2_rp (1.0f , 1.0f );
1135+
1136+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1137+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1138+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1139+ __nvvm_ff_to_ue8m0x2_rp_satfinite (1.0f , 1.0f );
1140+
1141+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1142+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1143+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1144+ __nvvm_bf16x2_to_ue8m0x2_rz ({(__bf16 )0.1f , (__bf16 )0.1f });
1145+
1146+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1147+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1148+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1149+ __nvvm_bf16x2_to_ue8m0x2_rz_satfinite ({(__bf16 )0.1f , (__bf16 )0.1f });
1150+
1151+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1152+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1153+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1154+ __nvvm_bf16x2_to_ue8m0x2_rp ({(__bf16 )0.1f , (__bf16 )0.1f });
1155+
1156+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1157+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1158+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1159+ __nvvm_bf16x2_to_ue8m0x2_rp_satfinite ({(__bf16 )0.1f , (__bf16 )0.1f });
1160+
1161+ // CHECK_PTX86_SM100a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1162+ // CHECK_PTX86_SM101a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1163+ // CHECK_PTX86_SM120a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1164+ __nvvm_ue8m0x2_to_bf16x2 (0x4C4C );
1165+
10241166#endif
10251167 // CHECK: ret void
10261168}
0 commit comments