|
43 | 43 | // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \
|
44 | 44 | // RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \
|
45 | 45 | // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s
|
| 46 | +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_103a -target-feature +ptx87 -DPTX=87 \ |
| 47 | +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ |
| 48 | +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX87_SM103a %s |
| 49 | +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 -DPTX=87 \ |
| 50 | +// RUN: -disable-llvm-optzns -fcuda-is-device -emit-llvm -o - -x cuda %s \ |
| 51 | +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX87_SM100a %s |
46 | 52 | // ### The last run to check with the highest SM and PTX version available
|
47 | 53 | // ### to make sure target builtins are still accepted.
|
48 | 54 | // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \
|
@@ -1203,6 +1209,123 @@ __device__ void nvvm_cvt_sm100a_sm101a_sm120a() {
|
1203 | 1209 | // CHECK: ret void
|
1204 | 1210 | }
|
1205 | 1211 |
|
// Codegen test for the "rs" conversion builtins. Each statement below calls one
// __nvvm_* builtin with constant arguments, and the FileCheck directives right
// above it expect a call to the matching @llvm.nvvm.* intrinsic followed by a
// store of that call's result into the local being initialised.
//
// The body is only compiled when PTX >= 87 and at least one of
// __CUDA_ARCH_FEAT_SM100_ALL / __CUDA_ARCH_FEAT_SM103_ALL is non-zero; in any
// other configuration the function is empty. Both the SM100a and the SM103a
// check prefixes expect identical IR for every builtin.
//
// Maintenance notes:
//  - The store patterns name the locals (ptr %r1 ... ptr %r18), so the local
//    variable names are part of what is verified and must not be renamed.
//  - Directives are matched in source order; keep each directive group directly
//    above the statement it describes.
//  - NOTE(review): "rs" presumably maps to the PTX cvt stochastic-rounding
//    modifier, with the trailing integer argument supplying the random bits;
//    confirm against the PTX ISA before relying on this description.
| 1212 | +__device__ void nvvm_cvt_sm100a_sm103a() { |
| 1213 | +#if (PTX >= 87) && (__CUDA_ARCH_FEAT_SM100_ALL || __CUDA_ARCH_FEAT_SM103_ALL) |
| 1214 | + |
// Local vector types used to receive the builtin results. Note that uint8x4 is
// built from plain `char` despite its name; the directives below only require
// that it appears as <4 x i8> in the IR.
| 1215 | + typedef __fp16 f16x2 __attribute__((ext_vector_type(2))); |
| 1216 | + typedef __bf16 bf16x2 __attribute__((ext_vector_type(2))); |
| 1217 | + typedef char uint8x4 __attribute__((ext_vector_type(4))); |
| 1218 | + |
// --- Two floats plus an i32 -> <2 x half> (plain, relu, satfinite, relu.satfinite)
| 1219 | +// CHECK_PTX87_SM100a: %[[R1:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1220 | +// CHECK_PTX87_SM100a: store <2 x half> %[[R1]], ptr %r1 |
| 1221 | +// CHECK_PTX87_SM103a: %[[R1:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1222 | +// CHECK_PTX87_SM103a: store <2 x half> %[[R1]], ptr %r1 |
| 1223 | + f16x2 r1 = __nvvm_ff2f16x2_rs(1.0f, 1.0f, 0); |
| 1224 | + |
| 1225 | +// CHECK_PTX87_SM100a: %[[R2:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1226 | +// CHECK_PTX87_SM100a: store <2 x half> %[[R2]], ptr %r2 |
| 1227 | +// CHECK_PTX87_SM103a: %[[R2:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1228 | +// CHECK_PTX87_SM103a: store <2 x half> %[[R2]], ptr %r2 |
| 1229 | + f16x2 r2 = __nvvm_ff2f16x2_rs_relu(1.0f, 1.0f, 0); |
| 1230 | + |
| 1231 | +// CHECK_PTX87_SM100a: %[[R3:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1232 | +// CHECK_PTX87_SM100a: store <2 x half> %[[R3]], ptr %r3 |
| 1233 | +// CHECK_PTX87_SM103a: %[[R3:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1234 | +// CHECK_PTX87_SM103a: store <2 x half> %[[R3]], ptr %r3 |
| 1235 | + f16x2 r3 = __nvvm_ff2f16x2_rs_satfinite(1.0f, 1.0f, 0); |
| 1236 | + |
| 1237 | +// CHECK_PTX87_SM100a: %[[R4:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1238 | +// CHECK_PTX87_SM100a: store <2 x half> %[[R4]], ptr %r4 |
| 1239 | +// CHECK_PTX87_SM103a: %[[R4:.*]] = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1240 | +// CHECK_PTX87_SM103a: store <2 x half> %[[R4]], ptr %r4 |
| 1241 | + f16x2 r4 = __nvvm_ff2f16x2_rs_relu_satfinite(1.0f, 1.0f, 0); |
| 1242 | + |
// --- Two floats plus an i32 -> <2 x bfloat> (same four variants)
| 1243 | +// CHECK_PTX87_SM100a: %[[R5:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1244 | +// CHECK_PTX87_SM100a: store <2 x bfloat> %[[R5]], ptr %r5 |
| 1245 | +// CHECK_PTX87_SM103a: %[[R5:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1246 | +// CHECK_PTX87_SM103a: store <2 x bfloat> %[[R5]], ptr %r5 |
| 1247 | + bf16x2 r5 = __nvvm_ff2bf16x2_rs(1.0f, 1.0f, 0); |
| 1248 | + |
| 1249 | +// CHECK_PTX87_SM100a: %[[R6:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1250 | +// CHECK_PTX87_SM100a: store <2 x bfloat> %[[R6]], ptr %r6 |
| 1251 | +// CHECK_PTX87_SM103a: %[[R6:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1252 | +// CHECK_PTX87_SM103a: store <2 x bfloat> %[[R6]], ptr %r6 |
| 1253 | + bf16x2 r6 = __nvvm_ff2bf16x2_rs_relu(1.0f, 1.0f, 0); |
| 1254 | + |
| 1255 | +// CHECK_PTX87_SM100a: %[[R7:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1256 | +// CHECK_PTX87_SM100a: store <2 x bfloat> %[[R7]], ptr %r7 |
| 1257 | +// CHECK_PTX87_SM103a: %[[R7:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1258 | +// CHECK_PTX87_SM103a: store <2 x bfloat> %[[R7]], ptr %r7 |
| 1259 | + bf16x2 r7 = __nvvm_ff2bf16x2_rs_satfinite(1.0f, 1.0f, 0); |
| 1260 | + |
| 1261 | +// CHECK_PTX87_SM100a: %[[R8:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1262 | +// CHECK_PTX87_SM100a: store <2 x bfloat> %[[R8]], ptr %r8 |
| 1263 | +// CHECK_PTX87_SM103a: %[[R8:.*]] = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float 1.000000e+00, float 1.000000e+00, i32 0) |
| 1264 | +// CHECK_PTX87_SM103a: store <2 x bfloat> %[[R8]], ptr %r8 |
| 1265 | + bf16x2 r8 = __nvvm_ff2bf16x2_rs_relu_satfinite(1.0f, 1.0f, 0); |
| 1266 | + |
// --- <4 x float> plus an i32 -> <4 x i8>, for the e4m3x4 / e5m2x4 / e2m3x4 /
// e3m2x4 targets (satfinite and relu.satfinite each). The all-ones initializer
// list is expected to show up in the IR as a splat constant.
| 1267 | +// CHECK_PTX87_SM100a: %[[R9:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1268 | +// CHECK_PTX87_SM100a: store <4 x i8> %[[R9]], ptr %r9 |
| 1269 | +// CHECK_PTX87_SM103a: %[[R9:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1270 | +// CHECK_PTX87_SM103a: store <4 x i8> %[[R9]], ptr %r9 |
| 1271 | + uint8x4 r9 = __nvvm_f32x4_to_e4m3x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); |
| 1272 | + |
| 1273 | +// CHECK_PTX87_SM100a: %[[R10:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1274 | +// CHECK_PTX87_SM100a: store <4 x i8> %[[R10]], ptr %r10 |
| 1275 | +// CHECK_PTX87_SM103a: %[[R10:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1276 | +// CHECK_PTX87_SM103a: store <4 x i8> %[[R10]], ptr %r10 |
| 1277 | + uint8x4 r10 = __nvvm_f32x4_to_e4m3x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); |
| 1278 | + |
| 1279 | +// CHECK_PTX87_SM100a: %[[R11:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1280 | +// CHECK_PTX87_SM100a: store <4 x i8> %[[R11]], ptr %r11 |
| 1281 | +// CHECK_PTX87_SM103a: %[[R11:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1282 | +// CHECK_PTX87_SM103a: store <4 x i8> %[[R11]], ptr %r11 |
| 1283 | + uint8x4 r11 = __nvvm_f32x4_to_e5m2x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); |
| 1284 | + |
| 1285 | +// CHECK_PTX87_SM100a: %[[R12:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1286 | +// CHECK_PTX87_SM100a: store <4 x i8> %[[R12]], ptr %r12 |
| 1287 | +// CHECK_PTX87_SM103a: %[[R12:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1288 | +// CHECK_PTX87_SM103a: store <4 x i8> %[[R12]], ptr %r12 |
| 1289 | + uint8x4 r12 = __nvvm_f32x4_to_e5m2x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); |
| 1290 | + |
| 1291 | +// CHECK_PTX87_SM100a: %[[R13:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1292 | +// CHECK_PTX87_SM100a: store <4 x i8> %[[R13]], ptr %r13 |
| 1293 | +// CHECK_PTX87_SM103a: %[[R13:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1294 | +// CHECK_PTX87_SM103a: store <4 x i8> %[[R13]], ptr %r13 |
| 1295 | + uint8x4 r13 = __nvvm_f32x4_to_e2m3x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); |
| 1296 | + |
| 1297 | +// CHECK_PTX87_SM100a: %[[R14:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1298 | +// CHECK_PTX87_SM100a: store <4 x i8> %[[R14]], ptr %r14 |
| 1299 | +// CHECK_PTX87_SM103a: %[[R14:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1300 | +// CHECK_PTX87_SM103a: store <4 x i8> %[[R14]], ptr %r14 |
| 1301 | + uint8x4 r14 = __nvvm_f32x4_to_e2m3x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); |
| 1302 | + |
| 1303 | +// CHECK_PTX87_SM100a: %[[R15:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1304 | +// CHECK_PTX87_SM100a: store <4 x i8> %[[R15]], ptr %r15 |
| 1305 | +// CHECK_PTX87_SM103a: %[[R15:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1306 | +// CHECK_PTX87_SM103a: store <4 x i8> %[[R15]], ptr %r15 |
| 1307 | + uint8x4 r15 = __nvvm_f32x4_to_e3m2x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); |
| 1308 | + |
| 1309 | +// CHECK_PTX87_SM100a: %[[R16:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1310 | +// CHECK_PTX87_SM100a: store <4 x i8> %[[R16]], ptr %r16 |
| 1311 | +// CHECK_PTX87_SM103a: %[[R16:.*]] = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1312 | +// CHECK_PTX87_SM103a: store <4 x i8> %[[R16]], ptr %r16 |
| 1313 | + uint8x4 r16 = __nvvm_f32x4_to_e3m2x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); |
| 1314 | + |
// --- <4 x float> plus an i32 -> i16 for the e2m1x4 target; unlike the groups
// above, the result is a scalar held in a `short` rather than a <4 x i8> vector.
| 1315 | +// CHECK_PTX87_SM100a: %[[R17:.*]] = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1316 | +// CHECK_PTX87_SM100a: store i16 %[[R17]], ptr %r17 |
| 1317 | +// CHECK_PTX87_SM103a: %[[R17:.*]] = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1318 | +// CHECK_PTX87_SM103a: store i16 %[[R17]], ptr %r17 |
| 1319 | + short r17 = __nvvm_f32x4_to_e2m1x4_rs_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); |
| 1320 | + |
| 1321 | +// CHECK_PTX87_SM100a: %[[R18:.*]] = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1322 | +// CHECK_PTX87_SM100a: store i16 %[[R18]], ptr %r18 |
| 1323 | +// CHECK_PTX87_SM103a: %[[R18:.*]] = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> splat (float 1.000000e+00), i32 0) |
| 1324 | +// CHECK_PTX87_SM103a: store i16 %[[R18]], ptr %r18 |
| 1325 | + short r18 = __nvvm_f32x4_to_e2m1x4_rs_relu_satfinite({1.0f, 1.0f, 1.0f, 1.0f}, 0); |
| 1326 | +#endif |
| 1327 | +} |
| 1328 | + |
1206 | 1329 | #define NAN32 0x7FBFFFFF
|
1207 | 1330 | #define NAN16 (__bf16)0x7FBF
|
1208 | 1331 | #define BF16 (__bf16)0.1f
|
|
0 commit comments