|
43 | 43 | // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \ |
44 | 44 | // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ |
45 | 45 | // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s |
| 46 | +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_103a -target-feature +ptx87 -DPTX=87 \ |
| 47 | +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ |
| 48 | +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX87_SM103a %s |
| 49 | +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 -DPTX=87 \ |
| 50 | +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ |
| 51 | +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX87_SM100a %s |
46 | 52 | // ### The last run to check with the highest SM and PTX version available |
47 | 53 | // ### to make sure target builtins are still accepted. |
48 | 54 | // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \ |
@@ -1203,6 +1209,83 @@ __device__ void nvvm_cvt_sm100a_sm101a_sm120a() { |
1203 | 1209 | // CHECK: ret void |
1204 | 1210 | } |
1205 | 1211 |
|
| 1212 | +__device__ void nvvm_cvt_sm100a_sm103a() { |
| 1213 | +#if (PTX >= 87) && (__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM103_ALL) |
| 1214 | + |
| 1215 | +// CHECK_PTX87_SM100a: call <2 x half> @llvm.nvvm.ff2f16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1216 | +// CHECK_PTX87_SM103a: call <2 x half> @llvm.nvvm.ff2f16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1217 | + __nvvm_ff2f16x2_rs(1.0f, 1.0f, 0); |
| 1218 | + |
| 1219 | +// CHECK_PTX87_SM100a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1220 | +// CHECK_PTX87_SM103a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1221 | + __nvvm_ff2f16x2_rs_relu(1.0f, 1.0f, 0); |
| 1222 | + |
| 1223 | +// CHECK_PTX87_SM100a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1224 | +// CHECK_PTX87_SM103a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1225 | + __nvvm_ff2f16x2_rs_satfinite(1.0f, 1.0f, 0); |
| 1226 | + |
| 1227 | +// CHECK_PTX87_SM100a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1228 | +// CHECK_PTX87_SM103a: call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1229 | + __nvvm_ff2f16x2_rs_relu_satfinite(1.0f, 1.0f, 0); |
| 1230 | + |
| 1231 | +// CHECK_PTX87_SM100a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1232 | +// CHECK_PTX87_SM103a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1233 | + __nvvm_ff2bf16x2_rs(1.0f, 1.0f, 0); |
| 1234 | + |
| 1235 | +// CHECK_PTX87_SM100a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1236 | +// CHECK_PTX87_SM103a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1237 | + __nvvm_ff2bf16x2_rs_relu(1.0f, 1.0f, 0); |
| 1238 | + |
| 1239 | +// CHECK_PTX87_SM100a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1240 | +// CHECK_PTX87_SM103a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1241 | + __nvvm_ff2bf16x2_rs_satfinite(1.0f, 1.0f, 0); |
| 1242 | + |
| 1243 | +// CHECK_PTX87_SM100a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1244 | +// CHECK_PTX87_SM103a: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1245 | + __nvvm_ff2bf16x2_rs_relu_satfinite(1.0f, 1.0f, 0); |
| 1246 | + |
| 1247 | +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1248 | +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1249 | + __nvvm_ff_to_e4m3x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); |
| 1250 | + |
| 1251 | +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1252 | +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e4m3x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1253 | + __nvvm_ff_to_e4m3x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); |
| 1254 | + |
| 1255 | +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1256 | +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1257 | + __nvvm_ff_to_e5m2x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); |
| 1258 | + |
| 1259 | +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1260 | +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e5m2x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1261 | + __nvvm_ff_to_e5m2x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); |
| 1262 | + |
| 1263 | +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1264 | +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1265 | + __nvvm_ff_to_e2m3x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); |
| 1266 | + |
| 1267 | +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1268 | +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e2m3x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1269 | + __nvvm_ff_to_e2m3x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); |
| 1270 | + |
| 1271 | +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1272 | +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1273 | + __nvvm_ff_to_e3m2x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); |
| 1274 | + |
| 1275 | +// CHECK_PTX87_SM100a: call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1276 | +// CHECK_PTX87_SM103a: call <4 x i8> @llvm.nvvm.ff.to.e3m2x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1277 | + __nvvm_ff_to_e3m2x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); |
| 1278 | + |
| 1279 | +// CHECK_PTX87_SM100a: call i16 @llvm.nvvm.ff.to.e2m1x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1280 | +// CHECK_PTX87_SM103a: call i16 @llvm.nvvm.ff.to.e2m1x4.rs.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1281 | + __nvvm_ff_to_e2m1x4_rs_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); |
| 1282 | + |
| 1283 | +// CHECK_PTX87_SM100a: call i16 @llvm.nvvm.ff.to.e2m1x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1284 | +// CHECK_PTX87_SM103a: call i16 @llvm.nvvm.ff.to.e2m1x4.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1285 | + __nvvm_ff_to_e2m1x4_rs_relu_satfinite(1.0f, 1.0f, 1.0f, 1.0f, 0); |
| 1286 | +#endif |
| 1287 | +} |
| 1288 | + |
1206 | 1289 | #define NAN32 0x7FBFFFFF |
1207 | 1290 | #define NAN16 (__bf16)0x7FBF |
1208 | 1291 | #define BF16 (__bf16)0.1f |
|
0 commit comments