@@ -932,3 +932,55 @@ void test_cvt_scalef32_pk_f32_fp6(global float32* out, uint6 src, float scale)
932932 * out = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6 (src , scale );
933933 * out = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6 (src , scale );
934934}
935+
936+ // CHECK-LABEL: @test_cvt_scalef32_pk32_f16_fpbf6(
937+ // CHECK-NEXT: entry:
938+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
939+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
940+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
941+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
942+ // CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32
943+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
944+ // CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
945+ // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
946+ // CHECK-NEXT: [[TMP2:%.*]] = call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> [[TMP0]], float [[TMP1]])
947+ // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
948+ // CHECK-NEXT: store <32 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 64
949+ // CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
950+ // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
951+ // CHECK-NEXT: [[TMP6:%.*]] = call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> [[TMP4]], float [[TMP5]])
952+ // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
953+ // CHECK-NEXT: store <32 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 64
954+ // CHECK-NEXT: ret void
955+ //
956+ void test_cvt_scalef32_pk32_f16_fpbf6 (global half32 * out , uint6 src , float scale )
957+ {
958+ * out = __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6 (src , scale );
959+ * out = __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6 (src , scale );
960+ }
961+
962+ // CHECK-LABEL: @test_cvt_scalef32_pk32_bf16_fpbf6(
963+ // CHECK-NEXT: entry:
964+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
965+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
966+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
967+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
968+ // CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32
969+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
970+ // CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
971+ // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
972+ // CHECK-NEXT: [[TMP2:%.*]] = call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> [[TMP0]], float [[TMP1]])
973+ // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
974+ // CHECK-NEXT: store <32 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 64
975+ // CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
976+ // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
977+ // CHECK-NEXT: [[TMP6:%.*]] = call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> [[TMP4]], float [[TMP5]])
978+ // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
979+ // CHECK-NEXT: store <32 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 64
980+ // CHECK-NEXT: ret void
981+ //
982+ void test_cvt_scalef32_pk32_bf16_fpbf6 (global bfloat32 * out , uint6 src , float scale )
983+ {
984+ * out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6 (src , scale );
985+ * out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6 (src , scale );
986+ }
0 commit comments