25
25
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
26
26
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
27
27
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
28
- // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \
28
+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 -DPTX=81 \
29
29
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
30
30
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
31
+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_90 -target-feature +ptx78 -DPTX=78 \
32
+ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
33
+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX78_SM90 %s
34
+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100 -target-feature +ptx86 -DPTX=86 \
35
+ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
36
+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100 %s
37
+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx86 -DPTX=86 \
38
+ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
39
+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100a %s
40
+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_101a -target-feature +ptx86 -DPTX=86 \
41
+ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
42
+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM101a %s
43
+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \
44
+ // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
45
+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
31
46
// ### The last run to check with the highest SM and PTX version available
32
47
// ### to make sure target builtins are still accepted.
33
- // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \
48
+ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \
34
49
// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
35
- // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
50
+ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
36
51
37
52
// Local definitions of the CUDA function qualifiers used by the test functions
// in this file; each expands directly to the corresponding attribute.
// NOTE(review): presumably defined here because no CUDA headers are included —
// confirm against the top of the file.
#define __device__ __attribute__((device))
38
53
#define __global__ __attribute__((global))
@@ -995,7 +1010,7 @@ __device__ void nvvm_cvt_sm80() {
995
1010
996
1011
// CHECK-LABEL: nvvm_cvt_sm89
997
1012
__device__ void nvvm_cvt_sm89 () {
998
- #if __CUDA_ARCH__ >= 890
1013
+ #if ( PTX >= 81 ) && ( __CUDA_ARCH__ >= 890 )
999
1014
// CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.000000e+00, float 1.000000e+00)
1000
1015
__nvvm_ff_to_e4m3x2_rn (1.0f , 1.0f );
1001
1016
// CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
@@ -1022,6 +1037,133 @@ __device__ void nvvm_cvt_sm89() {
1022
1037
__nvvm_e5m2x2_to_f16x2_rn (0x4c4c );
1023
1038
// CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532)
1024
1039
__nvvm_e5m2x2_to_f16x2_rn_relu (0x4c4c );
1040
+
1041
+ // CHECK_PTX81_SM89: call i32 @llvm.nvvm.f2tf32.rna.satfinite(float 1.000000e+00)
1042
+ __nvvm_f2tf32_rna_satfinite (1.0f );
1043
+ #endif
1044
+ // CHECK: ret void
1045
+ }
1046
+
1047
// nvvm_cvt_sm90: calls the four __nvvm_f2tf32 builtins (rn, rn_relu, rz,
// rz_relu) with the constant 1.0f and expects one call to the matching
// llvm.nvvm.f2tf32.* intrinsic, returning i32, for each of them.
//
// The calls are compiled only when the PTX macro (set with -DPTX=<n> on the
// RUN lines) is at least 78 and __CUDA_ARCH__ is at least 900. Their expected
// IR is listed under the CHECK_PTX78_SM90 prefix only, so other runs that
// satisfy the guard compile the calls without verifying them. The trailing
// `ret void` expectation uses the plain CHECK prefix and sits outside the
// guard, so it also applies when the body is preprocessed away.
+ // CHECK-LABEL: nvvm_cvt_sm90
1048
+ __device__ void nvvm_cvt_sm90 () {
1049
+ #if (PTX >= 78 ) && (__CUDA_ARCH__ >= 900 )
1050
+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn(float 1.000000e+00)
1051
+ __nvvm_f2tf32_rn (1.0f );
1052
+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn.relu(float 1.000000e+00)
1053
+ __nvvm_f2tf32_rn_relu (1.0f );
1054
+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz(float 1.000000e+00)
1055
+ __nvvm_f2tf32_rz (1.0f );
1056
+ // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz.relu(float 1.000000e+00)
1057
+ __nvvm_f2tf32_rz_relu (1.0f );
1058
+ #endif
1059
+ // CHECK: ret void
1060
+ }
1061
+
1062
// nvvm_cvt_sm100: calls the four satfinite __nvvm_f2tf32 builtins
// (rn_satfinite, rn_relu_satfinite, rz_satfinite, rz_relu_satfinite) with the
// constant 1.0f and expects one call to the matching
// llvm.nvvm.f2tf32.*.satfinite intrinsic, returning i32, for each of them.
//
// The calls are compiled only when the PTX macro (set with -DPTX=<n> on the
// RUN lines) is at least 86 and __CUDA_ARCH__ is at least 1000. Their expected
// IR is listed under the CHECK_PTX86_SM100 prefix only. The trailing
// `ret void` expectation uses the plain CHECK prefix and sits outside the
// guard, so it also applies when the body is preprocessed away.
+ // CHECK-LABEL: nvvm_cvt_sm100
1063
+ __device__ void nvvm_cvt_sm100 () {
1064
+ #if (PTX >= 86 ) && (__CUDA_ARCH__ >= 1000 )
1065
+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.satfinite(float 1.000000e+00)
1066
+ __nvvm_f2tf32_rn_satfinite (1.0f );
1067
+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float 1.000000e+00)
1068
+ __nvvm_f2tf32_rn_relu_satfinite (1.0f );
1069
+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.satfinite(float 1.000000e+00)
1070
+ __nvvm_f2tf32_rz_satfinite (1.0f );
1071
+ // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float 1.000000e+00)
1072
+ __nvvm_f2tf32_rz_relu_satfinite (1.0f );
1073
+ #endif
1074
+ // CHECK: ret void
1075
+ }
1076
+
1077
// nvvm_cvt_sm100a_sm101a_sm120a: exercises the conversion builtins for the
// e2m3x2, e3m2x2 and ue8m0x2 packed formats:
//   - float pair -> e2m3x2 / e3m2x2 (rn, optional relu, satfinite), i16 result
//   - e2m3x2 / e3m2x2 -> f16x2 (rn, optional relu), <2 x half> result
//   - float pair and bf16x2 -> ue8m0x2 (rz / rp, optional satfinite), i16 result
//   - ue8m0x2 -> bf16x2, <2 x bfloat> result
//
// The calls are compiled only when the PTX macro (set with -DPTX=<n> on the
// RUN lines) is at least 86 and at least one of __CUDA_ARCH_FEAT_SM100_ALL,
// __CUDA_ARCH_FEAT_SM101_ALL or __CUDA_ARCH_FEAT_SM120_ALL is set. Every call
// carries the same expected IR under three prefixes (CHECK_PTX86_SM100a,
// CHECK_PTX86_SM101a, CHECK_PTX86_SM120a), one per FileCheck invocation that
// enables it. The trailing `ret void` expectation uses the plain CHECK prefix
// and sits outside the guard, so it also applies when the body is
// preprocessed away.
+ // CHECK-LABEL: nvvm_cvt_sm100a_sm101a_sm120a
1078
+ __device__ void nvvm_cvt_sm100a_sm101a_sm120a () {
1079
+ #if (PTX >= 86 ) && \
1080
+ (__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM101_ALL || \
1081
+ __CUDA_ARCH_FEAT_SM120_ALL )
1082
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
1083
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
1084
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
1085
+ __nvvm_ff_to_e2m3x2_rn_satfinite (1.0f , 1.0f );
1086
+
1087
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
1088
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
1089
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
1090
+ __nvvm_ff_to_e2m3x2_rn_relu_satfinite (1.0f , 1.0f );
1091
+
1092
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
1093
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
1094
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00)
1095
+ __nvvm_ff_to_e3m2x2_rn_satfinite (1.0f , 1.0f );
1096
+
1097
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
1098
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
1099
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00)
1100
+ __nvvm_ff_to_e3m2x2_rn_relu_satfinite (1.0f , 1.0f );
1101
+
1102
// The packed inputs below are written in hex in the calls and appear in
// decimal in the expected IR: 0x4C4C is 19532 and 0x4848 is 18504.
+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1103
+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1104
+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532)
1105
+ __nvvm_e2m3x2_to_f16x2_rn (0x4C4C );
1106
+
1107
+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1108
+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1109
+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504)
1110
+ __nvvm_e2m3x2_to_f16x2_rn_relu (0x4848 );
1111
+
1112
+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1113
+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1114
+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504)
1115
+ __nvvm_e3m2x2_to_f16x2_rn (0x4848 );
1116
+
1117
+ // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1118
+ // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1119
+ // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
1120
+ __nvvm_e3m2x2_to_f16x2_rn_relu (0x4C4C );
1121
+
1122
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1123
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1124
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00)
1125
+ __nvvm_ff_to_ue8m0x2_rz (1.0f , 1.0f );
1126
+
1127
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1128
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1129
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00)
1130
+ __nvvm_ff_to_ue8m0x2_rz_satfinite (1.0f , 1.0f );
1131
+
1132
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1133
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1134
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00)
1135
+ __nvvm_ff_to_ue8m0x2_rp (1.0f , 1.0f );
1136
+
1137
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1138
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1139
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00)
1140
+ __nvvm_ff_to_ue8m0x2_rp_satfinite (1.0f , 1.0f );
1141
+
1142
// For the bf16x2 inputs the expected IR is a <2 x bfloat> splat of bfloat
// 0xR3DCD, i.e. both lanes hold the same (__bf16)0.1f constant. These
// expectations stop before the call's closing parenthesis.
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1143
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1144
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD)
1145
+ __nvvm_bf16x2_to_ue8m0x2_rz ({(__bf16 )0.1f , (__bf16 )0.1f });
1146
+
1147
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1148
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1149
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1150
+ __nvvm_bf16x2_to_ue8m0x2_rz_satfinite ({(__bf16 )0.1f , (__bf16 )0.1f });
1151
+
1152
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1153
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1154
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD)
1155
+ __nvvm_bf16x2_to_ue8m0x2_rp ({(__bf16 )0.1f , (__bf16 )0.1f });
1156
+
1157
+ // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1158
+ // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1159
+ // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD)
1160
+ __nvvm_bf16x2_to_ue8m0x2_rp_satfinite ({(__bf16 )0.1f , (__bf16 )0.1f });
1161
+
1162
+ // CHECK_PTX86_SM100a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1163
+ // CHECK_PTX86_SM101a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1164
+ // CHECK_PTX86_SM120a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532)
1165
+ __nvvm_ue8m0x2_to_bf16x2 (0x4C4C );
1166
+
1025
1167
#endif
1026
1168
// CHECK: ret void
1027
1169
}
0 commit comments