@@ -658,26 +658,47 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
658658; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] 
659659; GFX10-NEXT:    s_endpgm 
660660; 
661- ; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32: 
662- ; GFX11:       ; %bb.0: 
663- ; GFX11-NEXT:    s_clause 0x1 
664- ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 
665- ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34 
666- ; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0 
667- ; GFX11-NEXT:    v_mov_b32_e32 v2, 0 
668- ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) 
669- ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0 
670- ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) 
671- ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc 
672- ; GFX11-NEXT:    s_waitcnt vmcnt(0) 
673- ; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5] glc dlc 
674- ; GFX11-NEXT:    s_waitcnt vmcnt(0) 
675- ; GFX11-NEXT:    v_pk_add_u16 v0, v1, v0 
676- ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) 
677- ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0 
678- ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0 
679- ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1] 
680- ; GFX11-NEXT:    s_endpgm 
661+ ; GFX11-TRUE16-LABEL: v_test_add_v2i16_zext_to_v2i32: 
662+ ; GFX11-TRUE16:       ; %bb.0: 
663+ ; GFX11-TRUE16-NEXT:    s_clause 0x1 
664+ ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 
665+ ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34 
666+ ; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 
667+ ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) 
668+ ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0 
669+ ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) 
670+ ; GFX11-TRUE16-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc 
671+ ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) 
672+ ; GFX11-TRUE16-NEXT:    global_load_b32 v0, v0, s[4:5] glc dlc 
673+ ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) 
674+ ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v2, v1, v0 
675+ ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0 
676+ ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) 
677+ ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v2 
678+ ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.h 
679+ ; GFX11-TRUE16-NEXT:    global_store_b64 v3, v[0:1], s[0:1] 
680+ ; GFX11-TRUE16-NEXT:    s_endpgm 
681+ ; 
682+ ; GFX11-FAKE16-LABEL: v_test_add_v2i16_zext_to_v2i32: 
683+ ; GFX11-FAKE16:       ; %bb.0: 
684+ ; GFX11-FAKE16-NEXT:    s_clause 0x1 
685+ ; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 
686+ ; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34 
687+ ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0 
688+ ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0 
689+ ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) 
690+ ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0 
691+ ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) 
692+ ; GFX11-FAKE16-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc 
693+ ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) 
694+ ; GFX11-FAKE16-NEXT:    global_load_b32 v0, v0, s[4:5] glc dlc 
695+ ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) 
696+ ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v0, v1, v0 
697+ ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) 
698+ ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0 
699+ ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0 
700+ ; GFX11-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1] 
701+ ; GFX11-FAKE16-NEXT:    s_endpgm 
681702  %tid  = call  i32  @llvm.amdgcn.workitem.id.x ()
682703  %gep.out  = getelementptr  inbounds  <2  x i32 >, ptr  addrspace (1 ) %out , i32  %tid 
683704  %gep.in0  = getelementptr  inbounds  <2  x i16 >, ptr  addrspace (1 ) %in0 , i32  %tid 
@@ -971,30 +992,57 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
971992; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] 
972993; GFX10-NEXT:    s_endpgm 
973994; 
974- ; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64: 
975- ; GFX11:       ; %bb.0: 
976- ; GFX11-NEXT:    s_clause 0x1 
977- ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 
978- ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34 
979- ; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0 
980- ; GFX11-NEXT:    v_mov_b32_e32 v4, 0 
981- ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) 
982- ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0 
983- ; GFX11-NEXT:    s_waitcnt lgkmcnt(0) 
984- ; GFX11-NEXT:    s_clause 0x1 
985- ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] 
986- ; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5] 
987- ; GFX11-NEXT:    s_waitcnt vmcnt(0) 
988- ; GFX11-NEXT:    v_pk_add_u16 v0, v1, v0 
989- ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 
990- ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0 
991- ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16 
992- ; GFX11-NEXT:    v_bfe_i32 v2, v1, 0, 16 
993- ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
994- ; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v0 
995- ; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2 
996- ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1] 
997- ; GFX11-NEXT:    s_endpgm 
995+ ; GFX11-TRUE16-LABEL: v_test_add_v2i16_sext_to_v2i64: 
996+ ; GFX11-TRUE16:       ; %bb.0: 
997+ ; GFX11-TRUE16-NEXT:    s_clause 0x1 
998+ ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 
999+ ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34 
1000+ ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0 
1001+ ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, 0 
1002+ ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) 
1003+ ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0 
1004+ ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) 
1005+ ; GFX11-TRUE16-NEXT:    s_clause 0x1 
1006+ ; GFX11-TRUE16-NEXT:    global_load_b32 v1, v0, s[2:3] 
1007+ ; GFX11-TRUE16-NEXT:    global_load_b32 v0, v0, s[4:5] 
1008+ ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) 
1009+ ; GFX11-TRUE16-NEXT:    v_pk_add_u16 v0, v1, v0 
1010+ ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0 
1011+ ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 
1012+ ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h 
1013+ ; GFX11-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16 
1014+ ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
1015+ ; GFX11-TRUE16-NEXT:    v_bfe_i32 v2, v1, 0, 16 
1016+ ; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0 
1017+ ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) 
1018+ ; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v2 
1019+ ; GFX11-TRUE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1] 
1020+ ; GFX11-TRUE16-NEXT:    s_endpgm 
1021+ ; 
1022+ ; GFX11-FAKE16-LABEL: v_test_add_v2i16_sext_to_v2i64: 
1023+ ; GFX11-FAKE16:       ; %bb.0: 
1024+ ; GFX11-FAKE16-NEXT:    s_clause 0x1 
1025+ ; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 
1026+ ; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34 
1027+ ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0 
1028+ ; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, 0 
1029+ ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) 
1030+ ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0 
1031+ ; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) 
1032+ ; GFX11-FAKE16-NEXT:    s_clause 0x1 
1033+ ; GFX11-FAKE16-NEXT:    global_load_b32 v1, v0, s[2:3] 
1034+ ; GFX11-FAKE16-NEXT:    global_load_b32 v0, v0, s[4:5] 
1035+ ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) 
1036+ ; GFX11-FAKE16-NEXT:    v_pk_add_u16 v0, v1, v0 
1037+ ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 
1038+ ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0 
1039+ ; GFX11-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16 
1040+ ; GFX11-FAKE16-NEXT:    v_bfe_i32 v2, v1, 0, 16 
1041+ ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
1042+ ; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0 
1043+ ; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v2 
1044+ ; GFX11-FAKE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1] 
1045+ ; GFX11-FAKE16-NEXT:    s_endpgm 
9981046  %tid  = call  i32  @llvm.amdgcn.workitem.id.x ()
9991047  %gep.out  = getelementptr  inbounds  <2  x i64 >, ptr  addrspace (1 ) %out , i32  %tid 
10001048  %gep.in0  = getelementptr  inbounds  <2  x i16 >, ptr  addrspace (1 ) %in0 , i32  %tid 
0 commit comments