@@ -768,6 +768,112 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float
   *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
 }
 
+// CHECK-LABEL: @test_cvt_scalef32_sr_pk(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, addrspace(5)
+// CHECK-NEXT: [[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5)
+// CHECK-NEXT: [[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5)
+// CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, addrspace(5)
+// CHECK-NEXT: [[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5)
+// CHECK-NEXT: [[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
+// CHECK-NEXT: [[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr
+// CHECK-NEXT: [[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT: [[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH8_ADDR]] to ptr
+// CHECK-NEXT: [[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF8_ADDR]] to ptr
+// CHECK-NEXT: [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr
+// CHECK-NEXT: [[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT: [[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH16_ADDR]] to ptr
+// CHECK-NEXT: [[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF16_ADDR]] to ptr
+// CHECK-NEXT: [[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT1_ADDR]] to ptr
+// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <8 x bfloat> [[SRCBF8:%.*]], ptr [[SRCBF8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store <8 x half> [[SRCH8:%.*]], ptr [[SRCH8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store <8 x float> [[SRCF8:%.*]], ptr [[SRCF8_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store ptr addrspace(1) [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <16 x bfloat> [[SRCBF16:%.*]], ptr [[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store <16 x half> [[SRCH16:%.*]], ptr [[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store <16 x float> [[SRCF16:%.*]], ptr [[SRCF16_ADDR_ASCAST]], align 64
+// CHECK-NEXT: store ptr addrspace(1) [[OUT1:%.*]], ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.bf16(<8 x bfloat> [[TMP0]], i32 [[TMP1]], float [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(1) [[TMP4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.bf16(<8 x bfloat> [[TMP5]], i32 [[TMP6]], float [[TMP7]])
+// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 8
+// CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f16(<8 x half> [[TMP10]], i32 [[TMP11]], float [[TMP12]])
+// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP13]], ptr addrspace(1) [[TMP14]], align 8
+// CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f16(<8 x half> [[TMP15]], i32 [[TMP16]], float [[TMP17]])
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 8
+// CHECK-NEXT: [[TMP20:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP23:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f32(<8 x float> [[TMP20]], i32 [[TMP21]], float [[TMP22]])
+// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP23]], ptr addrspace(1) [[TMP24]], align 8
+// CHECK-NEXT: [[TMP25:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP28:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f32(<8 x float> [[TMP25]], i32 [[TMP26]], float [[TMP27]])
+// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 8
+// CHECK-NEXT: [[TMP30:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f32(<8 x float> [[TMP30]], i32 [[TMP31]], float [[TMP32]])
+// CHECK-NEXT: [[TMP34:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP33]], ptr addrspace(1) [[TMP34]], align 4
+// CHECK-NEXT: [[TMP35:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f16(<8 x half> [[TMP35]], i32 [[TMP36]], float [[TMP37]])
+// CHECK-NEXT: [[TMP39:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP38]], ptr addrspace(1) [[TMP39]], align 4
+// CHECK-NEXT: [[TMP40:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]])
+// CHECK-NEXT: [[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float8 srcf8,
+                             global uint3 *out3, bfloat16 srcbf16, half16 srch16, float16 srcf16,
+                             global uint *out1, uint sr, float scale)
+{
+  *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16(srcbf8, sr, scale);
+  *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16(srcbf8, sr, scale);
+  *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16(srch8, sr, scale);
+  *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16(srch8, sr, scale);
+  *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32(srcf8, sr, scale);
+  *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32(srcf8, sr, scale);
+  *out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32(srcf8, sr, scale);
+  *out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16(srch8, sr, scale);
+  *out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16(srcbf8, sr, scale);
+}
+
 // CHECK-LABEL: @test_sat_pk4_i4_i8(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)