@@ -1088,3 +1088,97 @@ void test_cvt_scalef32_pk_bf16_bf8(global bfloat2* out, unsigned int src, float
   *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(src, scale, true);
   *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(src, scale, false);
 }
+
+// CHECK-LABEL: @test_cvt_scalef32_pk_fp4_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i32 0)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i32 1)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP13]], <2 x half> [[TMP14]], float [[TMP15]], i32 2)
+// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(1) [[TMP18]], align 4
+// CHECK-NEXT: [[TMP20:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP19]], <2 x half> [[TMP20]], float [[TMP21]], i32 3)
+// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_pk_fp4_f16(global unsigned int* out, half2 src, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 0);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 1);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 2);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 3);
+}
+
+// CHECK-LABEL: @test_cvt_scalef32_pk_fp4_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i32 0)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i32 1)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP13]], <2 x bfloat> [[TMP14]], float [[TMP15]], i32 2)
+// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(1) [[TMP18]], align 4
+// CHECK-NEXT: [[TMP20:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP19]], <2 x bfloat> [[TMP20]], float [[TMP21]], i32 3)
+// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_pk_fp4_bf16(global unsigned int* out, bfloat2 src, float scale, uint old)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 0);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 1);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 2);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 3);
+}
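
For reference, a minimal usage sketch of the new builtin (not part of the patch; the kernel name and buffer layout are illustrative). Going by the generated IR above, each call takes the current 32-bit packed value, a pair of half values, and a float scale, and returns an updated dword; the final immediate (0-3) appears to be a word select choosing which byte of the dword receives the two converted FP4 values, so four calls pack eight halves into one dword. The bf16 variant mirrors this with bfloat2 sources.

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Hypothetical kernel: pack eight half values (four half2 pairs) into a
// single dword of scaled FP4 data, one byte per word-select index.
kernel void pack_fp4_sketch(global unsigned int* out, global half2* in, float scale)
{
  unsigned int packed = 0u;
  packed = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(packed, in[0], scale, 0); // byte 0
  packed = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(packed, in[1], scale, 1); // byte 1
  packed = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(packed, in[2], scale, 2); // byte 2
  packed = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(packed, in[3], scale, 3); // byte 3
  *out = packed;
}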