@@ -649,36 +649,35 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
649649;
650650; GFX9-LABEL: s_test_imin_sle_v4i8:
651651; GFX9: ; %bb.0:
652- ; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c
653652; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28
653+ ; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c
654654; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
655655; GFX9-NEXT: v_mov_b32_e32 v0, 0
656656; GFX9-NEXT: s_waitcnt lgkmcnt(0)
657- ; GFX9-NEXT: s_lshr_b32 s5, s2, 16
658- ; GFX9-NEXT: s_lshr_b32 s8, s3, 16
659- ; GFX9-NEXT: s_ashr_i32 s9, s3, 24
660- ; GFX9-NEXT: s_ashr_i32 s6, s2, 24
661- ; GFX9-NEXT: s_bfe_i32 s8, s8, 0x80000
662- ; GFX9-NEXT: v_mov_b32_e32 v1, s9
663- ; GFX9-NEXT: s_bfe_i32 s5, s5, 0x80000
657+ ; GFX9-NEXT: s_sext_i32_i16 s5, s2
664658; GFX9-NEXT: s_sext_i32_i16 s7, s3
665- ; GFX9-NEXT: v_min_i16_e32 v1, s6, v1
666- ; GFX9-NEXT: v_mov_b32_e32 v2, s8
667- ; GFX9-NEXT: s_sext_i32_i16 s4, s2
668- ; GFX9-NEXT: s_lshr_b32 s7, s7, 8
669- ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
670- ; GFX9-NEXT: v_min_i16_e32 v2, s5, v2
671- ; GFX9-NEXT: s_lshr_b32 s4, s4, 8
672- ; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80000
673- ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
674- ; GFX9-NEXT: v_mov_b32_e32 v2, s7
675- ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80000
676- ; GFX9-NEXT: v_min_i16_e32 v2, s4, v2
677- ; GFX9-NEXT: v_mov_b32_e32 v3, s3
678- ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
679- ; GFX9-NEXT: v_min_i16_e32 v3, s2, v3
680- ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
681- ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
659+ ; GFX9-NEXT: s_ashr_i32 s7, s7, 8
660+ ; GFX9-NEXT: s_ashr_i32 s5, s5, 8
661+ ; GFX9-NEXT: s_ashr_i32 s4, s2, 24
662+ ; GFX9-NEXT: s_ashr_i32 s6, s3, 24
663+ ; GFX9-NEXT: s_min_i32 s5, s5, s7
664+ ; GFX9-NEXT: s_sext_i32_i8 s7, s3
665+ ; GFX9-NEXT: s_sext_i32_i8 s8, s2
666+ ; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80010
667+ ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80010
668+ ; GFX9-NEXT: s_min_i32 s7, s8, s7
669+ ; GFX9-NEXT: s_min_i32 s4, s4, s6
670+ ; GFX9-NEXT: s_min_i32 s2, s2, s3
671+ ; GFX9-NEXT: s_lshl_b32 s5, s5, 8
672+ ; GFX9-NEXT: s_and_b32 s7, s7, 0xff
673+ ; GFX9-NEXT: s_lshl_b32 s4, s4, 8
674+ ; GFX9-NEXT: s_and_b32 s2, s2, 0xff
675+ ; GFX9-NEXT: s_or_b32 s5, s7, s5
676+ ; GFX9-NEXT: s_or_b32 s2, s2, s4
677+ ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff
678+ ; GFX9-NEXT: s_lshl_b32 s2, s2, 16
679+ ; GFX9-NEXT: s_or_b32 s2, s5, s2
680+ ; GFX9-NEXT: v_mov_b32_e32 v1, s2
682681; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
683682; GFX9-NEXT: s_endpgm
684683;
@@ -688,111 +687,70 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
688687; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28
689688; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c
690689; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
690+ ; GFX10-NEXT: v_mov_b32_e32 v0, 0
691691; GFX10-NEXT: s_waitcnt lgkmcnt(0)
692- ; GFX10-NEXT: s_sext_i32_i16 s4 , s2
692+ ; GFX10-NEXT: s_sext_i32_i16 s5 , s2
693693; GFX10-NEXT: s_sext_i32_i16 s7, s3
694- ; GFX10-NEXT: s_ashr_i32 s6, s2, 24
695- ; GFX10-NEXT: s_ashr_i32 s9, s3, 24
696- ; GFX10-NEXT: s_lshr_b32 s4, s4, 8
697- ; GFX10-NEXT: s_lshr_b32 s7, s7, 8
698- ; GFX10-NEXT: v_min_i16 v0, s6, s9
699- ; GFX10-NEXT: v_min_i16 v1, s4, s7
700- ; GFX10-NEXT: s_lshr_b32 s5, s2, 16
701- ; GFX10-NEXT: s_lshr_b32 s8, s3, 16
702- ; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80000
703- ; GFX10-NEXT: s_bfe_i32 s5, s5, 0x80000
704- ; GFX10-NEXT: s_bfe_i32 s4, s8, 0x80000
705- ; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80000
706- ; GFX10-NEXT: v_min_i16 v2, s5, s4
707- ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
708- ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
709- ; GFX10-NEXT: v_min_i16 v3, s2, s3
710- ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
711- ; GFX10-NEXT: v_mov_b32_e32 v2, 0
712- ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
713- ; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
714- ; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
694+ ; GFX10-NEXT: s_ashr_i32 s4, s2, 24
695+ ; GFX10-NEXT: s_ashr_i32 s6, s3, 24
696+ ; GFX10-NEXT: s_sext_i32_i8 s8, s3
697+ ; GFX10-NEXT: s_sext_i32_i8 s9, s2
698+ ; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80010
699+ ; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80010
700+ ; GFX10-NEXT: s_ashr_i32 s7, s7, 8
701+ ; GFX10-NEXT: s_ashr_i32 s5, s5, 8
702+ ; GFX10-NEXT: s_min_i32 s8, s9, s8
703+ ; GFX10-NEXT: s_min_i32 s4, s4, s6
704+ ; GFX10-NEXT: s_min_i32 s2, s2, s3
705+ ; GFX10-NEXT: s_min_i32 s3, s5, s7
706+ ; GFX10-NEXT: s_and_b32 s5, s8, 0xff
707+ ; GFX10-NEXT: s_lshl_b32 s4, s4, 8
708+ ; GFX10-NEXT: s_lshl_b32 s3, s3, 8
709+ ; GFX10-NEXT: s_and_b32 s2, s2, 0xff
710+ ; GFX10-NEXT: s_or_b32 s3, s5, s3
711+ ; GFX10-NEXT: s_or_b32 s2, s2, s4
712+ ; GFX10-NEXT: s_and_b32 s3, s3, 0xffff
713+ ; GFX10-NEXT: s_lshl_b32 s2, s2, 16
714+ ; GFX10-NEXT: s_or_b32 s2, s3, s2
715+ ; GFX10-NEXT: v_mov_b32_e32 v1, s2
716+ ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
715717; GFX10-NEXT: s_endpgm
716718;
717- ; GFX11-TRUE16-LABEL: s_test_imin_sle_v4i8:
718- ; GFX11-TRUE16: ; %bb.0:
719- ; GFX11-TRUE16-NEXT: s_clause 0x1
720- ; GFX11-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x28
721- ; GFX11-TRUE16-NEXT: s_load_b32 s1, s[4:5], 0x4c
722- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
723- ; GFX11-TRUE16-NEXT: s_sext_i32_i16 s2, s0
724- ; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16
725- ; GFX11-TRUE16-NEXT: s_sext_i32_i16 s7, s1
726- ; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 16
727- ; GFX11-TRUE16-NEXT: s_ashr_i32 s6, s0, 24
728- ; GFX11-TRUE16-NEXT: s_ashr_i32 s9, s1, 24
729- ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 8
730- ; GFX11-TRUE16-NEXT: s_bfe_i32 s3, s3, 0x80000
731- ; GFX11-TRUE16-NEXT: s_bfe_i32 s0, s0, 0x80000
732- ; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s7, 8
733- ; GFX11-TRUE16-NEXT: s_bfe_i32 s8, s8, 0x80000
734- ; GFX11-TRUE16-NEXT: s_bfe_i32 s1, s1, 0x80000
735- ; GFX11-TRUE16-NEXT: v_min_i16 v0.l, s6, s9
736- ; GFX11-TRUE16-NEXT: v_min_i16 v1.l, s3, s8
737- ; GFX11-TRUE16-NEXT: v_min_i16 v2.l, s2, s7
738- ; GFX11-TRUE16-NEXT: v_min_i16 v3.l, s0, s1
739- ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
740- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 8, v0
741- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
742- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
743- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
744- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
745- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
746- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2
747- ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
748- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
749- ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
750- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
751- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
752- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
753- ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
754- ; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
755- ; GFX11-TRUE16-NEXT: s_endpgm
756- ;
757- ; GFX11-FAKE16-LABEL: s_test_imin_sle_v4i8:
758- ; GFX11-FAKE16: ; %bb.0:
759- ; GFX11-FAKE16-NEXT: s_clause 0x1
760- ; GFX11-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x28
761- ; GFX11-FAKE16-NEXT: s_load_b32 s1, s[4:5], 0x4c
762- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
763- ; GFX11-FAKE16-NEXT: s_sext_i32_i16 s2, s0
764- ; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 16
765- ; GFX11-FAKE16-NEXT: s_sext_i32_i16 s7, s1
766- ; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s1, 16
767- ; GFX11-FAKE16-NEXT: s_ashr_i32 s6, s0, 24
768- ; GFX11-FAKE16-NEXT: s_bfe_i32 s0, s0, 0x80000
769- ; GFX11-FAKE16-NEXT: s_ashr_i32 s9, s1, 24
770- ; GFX11-FAKE16-NEXT: s_bfe_i32 s1, s1, 0x80000
771- ; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 8
772- ; GFX11-FAKE16-NEXT: s_bfe_i32 s3, s3, 0x80000
773- ; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s7, 8
774- ; GFX11-FAKE16-NEXT: s_bfe_i32 s8, s8, 0x80000
775- ; GFX11-FAKE16-NEXT: v_min_i16 v0, s6, s9
776- ; GFX11-FAKE16-NEXT: v_min_i16 v1, s0, s1
777- ; GFX11-FAKE16-NEXT: v_min_i16 v2, s3, s8
778- ; GFX11-FAKE16-NEXT: v_min_i16 v3, s2, s7
779- ; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
780- ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 8, v0
781- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
782- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2
783- ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3
784- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
785- ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v2, v0
786- ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v3
787- ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
788- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
789- ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
790- ; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
791- ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
792- ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
793- ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
794- ; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1]
795- ; GFX11-FAKE16-NEXT: s_endpgm
719+ ; GFX11-LABEL: s_test_imin_sle_v4i8:
720+ ; GFX11: ; %bb.0:
721+ ; GFX11-NEXT: s_clause 0x2
722+ ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28
723+ ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c
724+ ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
725+ ; GFX11-NEXT: v_mov_b32_e32 v0, 0
726+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
727+ ; GFX11-NEXT: s_sext_i32_i16 s5, s2
728+ ; GFX11-NEXT: s_sext_i32_i16 s7, s3
729+ ; GFX11-NEXT: s_ashr_i32 s4, s2, 24
730+ ; GFX11-NEXT: s_ashr_i32 s6, s3, 24
731+ ; GFX11-NEXT: s_sext_i32_i8 s8, s3
732+ ; GFX11-NEXT: s_sext_i32_i8 s9, s2
733+ ; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80010
734+ ; GFX11-NEXT: s_bfe_i32 s2, s2, 0x80010
735+ ; GFX11-NEXT: s_ashr_i32 s7, s7, 8
736+ ; GFX11-NEXT: s_ashr_i32 s5, s5, 8
737+ ; GFX11-NEXT: s_min_i32 s8, s9, s8
738+ ; GFX11-NEXT: s_min_i32 s4, s4, s6
739+ ; GFX11-NEXT: s_min_i32 s2, s2, s3
740+ ; GFX11-NEXT: s_min_i32 s3, s5, s7
741+ ; GFX11-NEXT: s_and_b32 s5, s8, 0xff
742+ ; GFX11-NEXT: s_lshl_b32 s4, s4, 8
743+ ; GFX11-NEXT: s_lshl_b32 s3, s3, 8
744+ ; GFX11-NEXT: s_and_b32 s2, s2, 0xff
745+ ; GFX11-NEXT: s_or_b32 s3, s5, s3
746+ ; GFX11-NEXT: s_or_b32 s2, s2, s4
747+ ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff
748+ ; GFX11-NEXT: s_lshl_b32 s2, s2, 16
749+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
750+ ; GFX11-NEXT: s_or_b32 s2, s3, s2
751+ ; GFX11-NEXT: v_mov_b32_e32 v1, s2
752+ ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
753+ ; GFX11-NEXT: s_endpgm
796754 %cmp = icmp sle <4 x i8 > %a , %b
797755 %val = select <4 x i1 > %cmp , <4 x i8 > %a , <4 x i8 > %b
798756 store <4 x i8 > %val , ptr addrspace (1 ) %out
0 commit comments