@@ -14,6 +14,8 @@ typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
 typedef float __attribute__((ext_vector_type(16))) float16;
 typedef half __attribute__((ext_vector_type(2))) half2;
 typedef float __attribute__((ext_vector_type(2))) float2;
+typedef half __attribute__((ext_vector_type(2))) half2;
+typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
 
 // CHECK-LABEL: @test_prng_b32(
 // CHECK-NEXT: entry:
@@ -619,3 +621,123 @@ void test_cvt_scalef32_pk_f32_bf8(global float2* out, unsigned int src, float sc
   *out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(src, scale, true);
   *out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(src, scale, false);
 }
+
+// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_pk_fp8_f16(global short2* out, half2 src, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(*out, src, scale, true);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(*out, src, scale, false);
+}
+
+// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_pk_fp8_bf16(global short2* out, bfloat2 src, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(*out, src, scale, true);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(*out, src, scale, false);
+}
+
+// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_pk_bf8_f16(global short2* out, half2 src, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(*out, src, scale, true);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(*out, src, scale, false);
+}
+
+// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_pk_bf8_bf16(global short2* out, bfloat2 src, float scale)
+{
+  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(*out, src, scale, true);
+  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(*out, src, scale, false);
+}