@@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
674674 * outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4 (src1 , scale , 7 );
675675}
676676
677+ // CHECK-LABEL: @test_cvt_scalef32_pk(
678+ // CHECK-NEXT: entry:
679+ // CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
680+ // CHECK-NEXT: [[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, addrspace(5)
681+ // CHECK-NEXT: [[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5)
682+ // CHECK-NEXT: [[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5)
683+ // CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
684+ // CHECK-NEXT: [[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, addrspace(5)
685+ // CHECK-NEXT: [[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5)
686+ // CHECK-NEXT: [[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
687+ // CHECK-NEXT: [[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
688+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
689+ // CHECK-NEXT: [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr
690+ // CHECK-NEXT: [[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF8_ADDR]] to ptr
691+ // CHECK-NEXT: [[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH8_ADDR]] to ptr
692+ // CHECK-NEXT: [[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF8_ADDR]] to ptr
693+ // CHECK-NEXT: [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr
694+ // CHECK-NEXT: [[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF16_ADDR]] to ptr
695+ // CHECK-NEXT: [[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH16_ADDR]] to ptr
696+ // CHECK-NEXT: [[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF16_ADDR]] to ptr
697+ // CHECK-NEXT: [[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT1_ADDR]] to ptr
698+ // CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
699+ // CHECK-NEXT: store ptr addrspace(1) [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8
700+ // CHECK-NEXT: store <8 x bfloat> [[SRCBF8:%.*]], ptr [[SRCBF8_ADDR_ASCAST]], align 16
701+ // CHECK-NEXT: store <8 x half> [[SRCH8:%.*]], ptr [[SRCH8_ADDR_ASCAST]], align 16
702+ // CHECK-NEXT: store <8 x float> [[SRCF8:%.*]], ptr [[SRCF8_ADDR_ASCAST]], align 32
703+ // CHECK-NEXT: store ptr addrspace(1) [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8
704+ // CHECK-NEXT: store <16 x bfloat> [[SRCBF16:%.*]], ptr [[SRCBF16_ADDR_ASCAST]], align 32
705+ // CHECK-NEXT: store <16 x half> [[SRCH16:%.*]], ptr [[SRCH16_ADDR_ASCAST]], align 32
706+ // CHECK-NEXT: store <16 x float> [[SRCF16:%.*]], ptr [[SRCF16_ADDR_ASCAST]], align 64
707+ // CHECK-NEXT: store ptr addrspace(1) [[OUT1:%.*]], ptr [[OUT1_ADDR_ASCAST]], align 8
708+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4
709+ // CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
710+ // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
711+ // CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x bfloat> [[TMP0]], float [[TMP1]])
712+ // CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
713+ // CHECK-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
714+ // CHECK-NEXT: [[TMP4:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
715+ // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
716+ // CHECK-NEXT: [[TMP6:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> [[TMP4]], float [[TMP5]])
717+ // CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
718+ // CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
719+ // CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
720+ // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
721+ // CHECK-NEXT: [[TMP10:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> [[TMP8]], float [[TMP9]])
722+ // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
723+ // CHECK-NEXT: store <2 x i32> [[TMP10]], ptr addrspace(1) [[TMP11]], align 8
724+ // CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
725+ // CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
726+ // CHECK-NEXT: [[TMP14:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> [[TMP12]], float [[TMP13]])
727+ // CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
728+ // CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
729+ // CHECK-NEXT: [[TMP16:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
730+ // CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
731+ // CHECK-NEXT: [[TMP18:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> [[TMP16]], float [[TMP17]])
732+ // CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
733+ // CHECK-NEXT: store <2 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 8
734+ // CHECK-NEXT: [[TMP20:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
735+ // CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
736+ // CHECK-NEXT: [[TMP22:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> [[TMP20]], float [[TMP21]])
737+ // CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
738+ // CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
739+ // CHECK-NEXT: [[TMP24:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
740+ // CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
741+ // CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> [[TMP24]], float [[TMP25]])
742+ // CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
743+ // CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
744+ // CHECK-NEXT: [[TMP28:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
745+ // CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
746+ // CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> [[TMP28]], float [[TMP29]])
747+ // CHECK-NEXT: [[TMP31:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
748+ // CHECK-NEXT: store i32 [[TMP30]], ptr addrspace(1) [[TMP31]], align 4
749+ // CHECK-NEXT: [[TMP32:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
750+ // CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
751+ // CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> [[TMP32]], float [[TMP33]])
752+ // CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
753+ // CHECK-NEXT: store i32 [[TMP34]], ptr addrspace(1) [[TMP35]], align 4
754+ // CHECK-NEXT: ret void
755+ //
756+ void test_cvt_scalef32_pk (global uint2 * out2 , bfloat8 srcbf8 , half8 srch8 , float8 srcf8 ,
757+ global uint3 * out3 , bfloat16 srcbf16 , half16 srch16 , float16 srcf16 ,
758+ global uint * out1 , float scale )
759+ {
760+ * out2 = __builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16 (srcbf8 , scale );
761+ * out2 = __builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16 (srcbf8 , scale );
762+ * out2 = __builtin_amdgcn_cvt_scalef32_pk8_fp8_f16 (srch8 , scale );
763+ * out2 = __builtin_amdgcn_cvt_scalef32_pk8_bf8_f16 (srch8 , scale );
764+ * out2 = __builtin_amdgcn_cvt_scalef32_pk8_fp8_f32 (srcf8 , scale );
765+ * out2 = __builtin_amdgcn_cvt_scalef32_pk8_bf8_f32 (srcf8 , scale );
766+ * out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_f32 (srcf8 , scale );
767+ * out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_f16 (srch8 , scale );
768+ * out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16 (srcbf8 , scale );
769+ }
770+
677771// CHECK-LABEL: @test_sat_pk4_i4_i8(
678772// CHECK-NEXT: entry:
679773// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
0 commit comments