2525// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
2626// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
2727// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
28- // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \
28+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 -DPTX=81 \
2929// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
3030// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
31+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_90 -target-feature +ptx78 -DPTX=78 \
32+ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
33+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX78_SM90 %s
34+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100 -target-feature +ptx86 -DPTX=86 \
35+ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
36+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100 %s
37+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx86 -DPTX=86 \
38+ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
39+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100a %s
40+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_101a -target-feature +ptx86 -DPTX=86 \
41+ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
42+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM101a %s
43+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \
44+ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
45+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
3146// ### The last run to check with the highest SM and PTX version available
3247// ### to make sure target builtins are still accepted.
33- // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \
48+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \
3449// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
35- // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
50+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
3651
3752#define __device__ __attribute__((device))
3853#define __global__ __attribute__((global))
@@ -994,7 +1009,7 @@ __device__ void nvvm_cvt_sm80() {
9941009
9951010// CHECK-LABEL: nvvm_cvt_sm89
9961011__device__ void nvvm_cvt_sm89 () {
997- #if __CUDA_ARCH__ >= 890
1012+ #if ( PTX >= 81 ) && ( __CUDA_ARCH__ >= 890 )
9981013 // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.000000e+00, float 1.000000e+00)
9991014 __nvvm_ff_to_e4m3x2_rn (1.0f , 1.0f );
10001015 // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
@@ -1021,6 +1036,174 @@ __device__ void nvvm_cvt_sm89() {
10211036 __nvvm_e5m2x2_to_f16x2_rn (0x4c4c );
10221037 // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532)
10231038 __nvvm_e5m2x2_to_f16x2_rn_relu (0x4c4c );
1039+
1040+ // CHECK_PTX81_SM89: call i32 @llvm.nvvm.f2tf32.rna.satfinite(float 1.000000e+00)
1041+ __nvvm_f2tf32_rna_satfinite (1.0f );
1042+ #endif
1043+ // CHECK: ret void
1044+ }
1045+
1046+ // CHECK-LABEL: nvvm_cvt_sm90
1047+ __device__ void nvvm_cvt_sm90 () {
1048+ #if (PTX >= 78 ) && (__CUDA_ARCH__ >= 900 )
1049+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn(float 1.000000e+00)
1050+ __nvvm_f2tf32_rn (1.0f );
1051+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn.relu(float 1.000000e+00)
1052+ __nvvm_f2tf32_rn_relu (1.0f );
1053+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz(float 1.000000e+00)
1054+ __nvvm_f2tf32_rz (1.0f );
1055+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz.relu(float 1.000000e+00)
1056+ __nvvm_f2tf32_rz_relu (1.0f );
1057+ #endif
1058+ // CHECK: ret void
1059+ }
1060+
1061+ // CHECK-LABEL: nvvm_cvt_sm100
1062+ __device__ void nvvm_cvt_sm100 () {
1063+ #if (PTX >= 86 ) && (__CUDA_ARCH__ >= 1000 )
1064+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.satfinite(float 1.000000e+00)
1065+ __nvvm_f2tf32_rn_satfinite (1.0f );
1066+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float 1.000000e+00)
1067+ __nvvm_f2tf32_rn_relu_satfinite (1.0f );
1068+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.satfinite(float 1.000000e+00)
1069+ __nvvm_f2tf32_rz_satfinite (1.0f );
1070+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float 1.000000e+00)
1071+ __nvvm_f2tf32_rz_relu_satfinite (1.0f );
1072+ #endif
1073+ // CHECK: ret void
1074+ }
1075+
1076+ // CHECK-LABEL: nvvm_cvt_sm100a
1077+ __device__ void nvvm_cvt_sm100a () {
1078+ #if (PTX >= 86 ) && __CUDA_ARCH_FEAT_SM100_ALL
1079+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00)
1080+ __nvvm_ff_to_e2m3x2_rn (1.0f , 1.0f );
1081+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1082+ __nvvm_ff_to_e2m3x2_rn_relu (1.0f , 1.0f );
1083+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00)
1084+ __nvvm_ff_to_e3m2x2_rn (1.0f , 1.0f );
1085+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1086+ __nvvm_ff_to_e3m2x2_rn_relu (1.0f , 1.0f );
1087+
1088+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1089+ __nvvm_e2m3x2_to_f16x2_rn (0x4C4C );
1090+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1091+ __nvvm_e2m3x2_to_f16x2_rn_relu (0x4848 );
1092+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1093+ __nvvm_e3m2x2_to_f16x2_rn (0x4848 );
1094+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1095+ __nvvm_e3m2x2_to_f16x2_rn_relu (0x4C4C );
1096+
1097+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1098+ __nvvm_ff_to_ue8m0x2_rz (1.0f , 1.0f );
1099+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1100+ __nvvm_ff_to_ue8m0x2_rz_satfinite (1.0f , 1.0f );
1101+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1102+ __nvvm_ff_to_ue8m0x2_rp (1.0f , 1.0f );
1103+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1104+ __nvvm_ff_to_ue8m0x2_rp_satfinite (1.0f , 1.0f );
1105+
1106+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1107+ __nvvm_bf16x2_to_ue8m0x2_rz ({(__bf16 )0.1f , (__bf16 )0.1f });
1108+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1109+ __nvvm_bf16x2_to_ue8m0x2_rz_satfinite ({(__bf16 )0.1f , (__bf16 )0.1f });
1110+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1111+ __nvvm_bf16x2_to_ue8m0x2_rp ({(__bf16 )0.1f , (__bf16 )0.1f });
1112+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1113+ __nvvm_bf16x2_to_ue8m0x2_rp_satfinite ({(__bf16 )0.1f , (__bf16 )0.1f });
1114+
1115+ // CHECK_PTX86_SM100a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1116+ __nvvm_ue8m0x2_to_bf16x2 (0x4C4C );
1117+ #endif
1118+ // CHECK: ret void
1119+ }
1120+
1121+ // CHECK-LABEL: nvvm_cvt_sm101a
1122+ __device__ void nvvm_cvt_sm101a () {
1123+ #if (PTX >= 86 ) && __CUDA_ARCH_FEAT_SM101_ALL
1124+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00)
1125+ __nvvm_ff_to_e2m3x2_rn (1.0f , 1.0f );
1126+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1127+ __nvvm_ff_to_e2m3x2_rn_relu (1.0f , 1.0f );
1128+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00)
1129+ __nvvm_ff_to_e3m2x2_rn (1.0f , 1.0f );
1130+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1131+ __nvvm_ff_to_e3m2x2_rn_relu (1.0f , 1.0f );
1132+
1133+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1134+ __nvvm_e2m3x2_to_f16x2_rn (0x4C4C );
1135+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1136+ __nvvm_e2m3x2_to_f16x2_rn_relu (0x4848 );
1137+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1138+ __nvvm_e3m2x2_to_f16x2_rn (0x4848 );
1139+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1140+ __nvvm_e3m2x2_to_f16x2_rn_relu (0x4C4C );
1141+
1142+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1143+ __nvvm_ff_to_ue8m0x2_rz (1.0f , 1.0f );
1144+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1145+ __nvvm_ff_to_ue8m0x2_rz_satfinite (1.0f , 1.0f );
1146+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1147+ __nvvm_ff_to_ue8m0x2_rp (1.0f , 1.0f );
1148+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1149+ __nvvm_ff_to_ue8m0x2_rp_satfinite (1.0f , 1.0f );
1150+
1151+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1152+ __nvvm_bf16x2_to_ue8m0x2_rz ({(__bf16 )0.1f , (__bf16 )0.1f });
1153+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1154+ __nvvm_bf16x2_to_ue8m0x2_rz_satfinite ({(__bf16 )0.1f , (__bf16 )0.1f });
1155+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1156+ __nvvm_bf16x2_to_ue8m0x2_rp ({(__bf16 )0.1f , (__bf16 )0.1f });
1157+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1158+ __nvvm_bf16x2_to_ue8m0x2_rp_satfinite ({(__bf16 )0.1f , (__bf16 )0.1f });
1159+
1160+ // CHECK_PTX86_SM101a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1161+ __nvvm_ue8m0x2_to_bf16x2 (0x4C4C );
1162+ #endif
1163+ // CHECK: ret void
1164+ }
1165+
1166+ // CHECK-LABEL: nvvm_cvt_sm120a
1167+ __device__ void nvvm_cvt_sm120a () {
1168+ #if (PTX >= 86 ) && __CUDA_ARCH_FEAT_SM120_ALL
1169+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00)
1170+ __nvvm_ff_to_e2m3x2_rn (1.0f , 1.0f );
1171+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1172+ __nvvm_ff_to_e2m3x2_rn_relu (1.0f , 1.0f );
1173+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00)
1174+ __nvvm_ff_to_e3m2x2_rn (1.0f , 1.0f );
1175+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
1176+ __nvvm_ff_to_e3m2x2_rn_relu (1.0f , 1.0f );
1177+
1178+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1179+ __nvvm_e2m3x2_to_f16x2_rn (0x4C4C );
1180+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1181+ __nvvm_e2m3x2_to_f16x2_rn_relu (0x4848 );
1182+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1183+ __nvvm_e3m2x2_to_f16x2_rn (0x4848 );
1184+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1185+ __nvvm_e3m2x2_to_f16x2_rn_relu (0x4C4C );
1186+
1187+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1188+ __nvvm_ff_to_ue8m0x2_rz (1.0f , 1.0f );
1189+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1190+ __nvvm_ff_to_ue8m0x2_rz_satfinite (1.0f , 1.0f );
1191+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1192+ __nvvm_ff_to_ue8m0x2_rp (1.0f , 1.0f );
1193+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1194+ __nvvm_ff_to_ue8m0x2_rp_satfinite (1.0f , 1.0f );
1195+
1196+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1197+ __nvvm_bf16x2_to_ue8m0x2_rz ({(__bf16 )0.1f , (__bf16 )0.1f });
1198+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1199+ __nvvm_bf16x2_to_ue8m0x2_rz_satfinite ({(__bf16 )0.1f , (__bf16 )0.1f });
1200+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1201+ __nvvm_bf16x2_to_ue8m0x2_rp ({(__bf16 )0.1f , (__bf16 )0.1f });
1202+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1203+ __nvvm_bf16x2_to_ue8m0x2_rp_satfinite ({(__bf16 )0.1f , (__bf16 )0.1f });
1204+
1205+ // CHECK_PTX86_SM120a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1206+ __nvvm_ue8m0x2_to_bf16x2 (0x4C4C );
10241207#endif
10251208 // CHECK: ret void
10261209}
0 commit comments