@@ -1182,3 +1182,159 @@ void test_cvt_scalef32_pk_fp4_bf16(global unsigned int* out, bfloat2 src, float
   *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 2);
   *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 3);
 }
+
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP1]], <2 x half> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP8]], <2 x half> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
+// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP15]], <2 x half> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
+// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
+// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4
+// CHECK-NEXT: [[TMP23:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP22]], <2 x half> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3)
+// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
+// CHECK-NEXT: ret void
+//
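+// Note: the trailing immediate (0-3) is presumably the destination byte select
+// for the packed fp4 pair within the i32 result; each call below exercises one value.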
+void test_cvt_scalef32_sr_pk_fp4_f16(global unsigned* out, half2 src, uint seed, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 0);
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 1);
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 2);
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 3);
+}
+
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP1]], <2 x bfloat> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP8]], <2 x bfloat> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
+// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP15]], <2 x bfloat> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
+// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
+// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4
+// CHECK-NEXT: [[TMP23:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP22]], <2 x bfloat> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3)
+// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
+// CHECK-NEXT: ret void
+//
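+// Same as the f16 case above, but with a <2 x bfloat> source; the trailing
+// immediate again cycles through all four destination slots.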
+void test_cvt_scalef32_sr_pk_fp4_bf16(global unsigned* out, bfloat2 src, uint seed, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 0);
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 1);
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 2);
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 3);
+}
+
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x float>, align 8, addrspace(5)
+// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x float> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP1]], <2 x float> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP8]], <2 x float> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
+// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP15]], <2 x float> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
+// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
+// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4
+// CHECK-NEXT: [[TMP23:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
+// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP22]], <2 x float> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3)
+// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
+// CHECK-NEXT: ret void
+//
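+// Same pattern for a <2 x float> source.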
+void test_cvt_scalef32_sr_pk_fp4_f32(global unsigned* out, float2 src, uint seed, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 0);
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 1);
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 2);
+  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 3);
+}