@@ -984,3 +984,107 @@ void test_cvt_scalef32_pk32_bf16_fpbf6(global bfloat32 *out, uint6 src, float sc
984984 * out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6 (src , scale );
985985 * out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6 (src , scale );
986986}
987+
988+ // CHECK-LABEL: @test_cvt_scalef32_pk_f16_fp8(
989+ // CHECK-NEXT: entry:
990+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
991+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
992+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
993+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
994+ // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
995+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
996+ // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
997+ // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
998+ // CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP0]], float [[TMP1]], i1 true)
999+ // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1000+ // CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
1001+ // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1002+ // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1003+ // CHECK-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP4]], float [[TMP5]], i1 false)
1004+ // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1005+ // CHECK-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
1006+ // CHECK-NEXT: ret void
1007+ //
// Calls __builtin_amdgcn_cvt_scalef32_pk_f16_fp8 twice on the same src and
// scale, once with the trailing flag true and once with it false, and stores
// each half2 result through out. The expected-IR assertions preceding this
// function require both calls to lower to @llvm.amdgcn.cvt.scalef32.pk.f16.fp8
// with the flag passed as a constant i1 and a <2 x half> result.
// NOTE(review): the flag presumably selects which 16-bit half of src supplies
// the packed fp8 pair -- confirm against the builtin's documentation.
1008+ void test_cvt_scalef32_pk_f16_fp8 (global half2 * out , unsigned int src , float scale )
1009+ {
1010+ * out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8 (src , scale , true);
// Same operands with the flag false; this store overwrites the previous one.
1011+ * out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8 (src , scale , false);
1012+ }
1013+
1014+ // CHECK-LABEL: @test_cvt_scalef32_pk_f16_bf8(
1015+ // CHECK-NEXT: entry:
1016+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1017+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1018+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1019+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1020+ // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1021+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1022+ // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1023+ // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1024+ // CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP0]], float [[TMP1]], i1 true)
1025+ // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1026+ // CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
1027+ // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1028+ // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1029+ // CHECK-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP4]], float [[TMP5]], i1 false)
1030+ // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1031+ // CHECK-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
1032+ // CHECK-NEXT: ret void
1033+ //
// Calls __builtin_amdgcn_cvt_scalef32_pk_f16_bf8 twice on the same src and
// scale, once with the trailing flag true and once with it false, and stores
// each half2 result through out. The expected-IR assertions preceding this
// function require both calls to lower to @llvm.amdgcn.cvt.scalef32.pk.f16.bf8
// with the flag passed as a constant i1 and a <2 x half> result.
// NOTE(review): the flag presumably selects which 16-bit half of src supplies
// the packed bf8 pair -- confirm against the builtin's documentation.
1034+ void test_cvt_scalef32_pk_f16_bf8 (global half2 * out , unsigned int src , float scale )
1035+ {
1036+ * out = __builtin_amdgcn_cvt_scalef32_pk_f16_bf8 (src , scale , true);
// Same operands with the flag false; this store overwrites the previous one.
1037+ * out = __builtin_amdgcn_cvt_scalef32_pk_f16_bf8 (src , scale , false);
1038+ }
1039+
1040+ // CHECK-LABEL: @test_cvt_scalef32_pk_bf16_fp8(
1041+ // CHECK-NEXT: entry:
1042+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1043+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1044+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1045+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1046+ // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1047+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1048+ // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1049+ // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1050+ // CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 [[TMP0]], float [[TMP1]], i1 true)
1051+ // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1052+ // CHECK-NEXT: store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
1053+ // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1054+ // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1055+ // CHECK-NEXT: [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 [[TMP4]], float [[TMP5]], i1 false)
1056+ // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1057+ // CHECK-NEXT: store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
1058+ // CHECK-NEXT: ret void
1059+ //
// Calls __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8 twice on the same src and
// scale, once with the trailing flag true and once with it false, and stores
// each bfloat2 result through out. The expected-IR assertions preceding this
// function require both calls to lower to
// @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8 with the flag passed as a constant i1
// and a <2 x bfloat> result.
// NOTE(review): the flag presumably selects which 16-bit half of src supplies
// the packed fp8 pair -- confirm against the builtin's documentation.
1060+ void test_cvt_scalef32_pk_bf16_fp8 (global bfloat2 * out , unsigned int src , float scale )
1061+ {
1062+ * out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8 (src , scale , true);
// Same operands with the flag false; this store overwrites the previous one.
1063+ * out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8 (src , scale , false);
1064+ }
1065+
1066+ // CHECK-LABEL: @test_cvt_scalef32_pk_bf16_bf8(
1067+ // CHECK-NEXT: entry:
1068+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1069+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1070+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1071+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1072+ // CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1073+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1074+ // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1075+ // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1076+ // CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 [[TMP0]], float [[TMP1]], i1 true)
1077+ // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1078+ // CHECK-NEXT: store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
1079+ // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1080+ // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1081+ // CHECK-NEXT: [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 [[TMP4]], float [[TMP5]], i1 false)
1082+ // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1083+ // CHECK-NEXT: store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
1084+ // CHECK-NEXT: ret void
1085+ //
// Calls __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8 twice on the same src and
// scale, once with the trailing flag true and once with it false, and stores
// each bfloat2 result through out. The expected-IR assertions preceding this
// function require both calls to lower to
// @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8 with the flag passed as a constant i1
// and a <2 x bfloat> result.
// NOTE(review): the flag presumably selects which 16-bit half of src supplies
// the packed bf8 pair -- confirm against the builtin's documentation.
1086+ void test_cvt_scalef32_pk_bf16_bf8 (global bfloat2 * out , unsigned int src , float scale )
1087+ {
1088+ * out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8 (src , scale , true);
// Same operands with the flag false; this store overwrites the previous one.
1089+ * out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8 (src , scale , false);
1090+ }
0 commit comments