@@ -1600,14 +1600,8 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
16001600; NOOPT-NEXT: ; implicit-def: $vgpr0
16011601; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
16021602; NOOPT-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
1603- ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
1604- ; NOOPT-NEXT: s_waitcnt expcnt(0)
1605- ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
1606- ; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
1607- ; NOOPT-NEXT: s_waitcnt vmcnt(0)
1608- ; NOOPT-NEXT: v_readlane_b32 s0, v31, 6
1609- ; NOOPT-NEXT: v_readlane_b32 s1, v31, 7
16101603; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:72 ; 4-byte Folded Reload
1604+ ; NOOPT-NEXT: s_waitcnt expcnt(1)
16111605; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
16121606; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
16131607; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
@@ -1618,14 +1612,26 @@ define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
16181612; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload
16191613; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload
16201614; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload
1615+ ; NOOPT-NEXT: s_waitcnt expcnt(6)
16211616; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload
1617+ ; NOOPT-NEXT: s_waitcnt expcnt(5)
16221618; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload
1619+ ; NOOPT-NEXT: s_waitcnt expcnt(4)
16231620; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload
1621+ ; NOOPT-NEXT: s_waitcnt expcnt(3)
16241622; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload
1623+ ; NOOPT-NEXT: s_waitcnt expcnt(2)
16251624; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload
1625+ ; NOOPT-NEXT: s_waitcnt expcnt(1)
16261626; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
16271627; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
1628+ ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
1629+ ; NOOPT-NEXT: s_waitcnt expcnt(0)
1630+ ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
1631+ ; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
16281632; NOOPT-NEXT: s_waitcnt vmcnt(0)
1633+ ; NOOPT-NEXT: v_readlane_b32 s0, v31, 6
1634+ ; NOOPT-NEXT: v_readlane_b32 s1, v31, 7
16291635; NOOPT-NEXT: v_readfirstlane_b32 s2, v16
16301636; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v16
16311637; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -4122,13 +4128,6 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
41224128; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
41234129; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
41244130; NOOPT-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
4125- ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
4126- ; NOOPT-NEXT: s_waitcnt expcnt(0)
4127- ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
4128- ; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
4129- ; NOOPT-NEXT: s_waitcnt vmcnt(0)
4130- ; NOOPT-NEXT: v_readlane_b32 s0, v31, 6
4131- ; NOOPT-NEXT: v_readlane_b32 s1, v31, 7
41324131; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
41334132; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
41344133; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
@@ -4154,7 +4153,12 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
41544153; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
41554154; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
41564155; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload
4156+ ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
4157+ ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
4158+ ; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
41574159; NOOPT-NEXT: s_waitcnt vmcnt(0)
4160+ ; NOOPT-NEXT: v_readlane_b32 s0, v31, 6
4161+ ; NOOPT-NEXT: v_readlane_b32 s1, v31, 7
41584162; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
41594163; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
41604164; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -4607,13 +4611,6 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p
46074611; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
46084612; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
46094613; NOOPT-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
4610- ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
4611- ; NOOPT-NEXT: s_waitcnt expcnt(0)
4612- ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
4613- ; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
4614- ; NOOPT-NEXT: s_waitcnt vmcnt(0)
4615- ; NOOPT-NEXT: v_readlane_b32 s0, v31, 6
4616- ; NOOPT-NEXT: v_readlane_b32 s1, v31, 7
46174614; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
46184615; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
46194616; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
@@ -4639,7 +4636,12 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p
46394636; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
46404637; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
46414638; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload
4639+ ; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
4640+ ; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 ; 4-byte Folded Reload
4641+ ; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
46424642; NOOPT-NEXT: s_waitcnt vmcnt(0)
4643+ ; NOOPT-NEXT: v_readlane_b32 s0, v31, 6
4644+ ; NOOPT-NEXT: v_readlane_b32 s1, v31, 7
46434645; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
46444646; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
46454647; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -5161,14 +5163,8 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
51615163; NOOPT-NEXT: ; implicit-def: $vgpr0
51625164; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
51635165; NOOPT-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1
5164- ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
5165- ; NOOPT-NEXT: s_waitcnt expcnt(0)
5166- ; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
5167- ; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
5168- ; NOOPT-NEXT: s_waitcnt vmcnt(0)
5169- ; NOOPT-NEXT: v_readlane_b32 s0, v18, 23
5170- ; NOOPT-NEXT: v_readlane_b32 s1, v18, 24
51715166; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:80 ; 4-byte Folded Reload
5167+ ; NOOPT-NEXT: s_waitcnt expcnt(1)
51725168; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; 4-byte Folded Reload
51735169; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 ; 4-byte Folded Reload
51745170; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:12 ; 4-byte Folded Reload
@@ -5179,14 +5175,26 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
51795175; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:32 ; 4-byte Folded Reload
51805176; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:36 ; 4-byte Folded Reload
51815177; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:40 ; 4-byte Folded Reload
5178+ ; NOOPT-NEXT: s_waitcnt expcnt(6)
51825179; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:44 ; 4-byte Folded Reload
5180+ ; NOOPT-NEXT: s_waitcnt expcnt(5)
51835181; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:48 ; 4-byte Folded Reload
5182+ ; NOOPT-NEXT: s_waitcnt expcnt(4)
51845183; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:52 ; 4-byte Folded Reload
5184+ ; NOOPT-NEXT: s_waitcnt expcnt(3)
51855185; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:56 ; 4-byte Folded Reload
5186+ ; NOOPT-NEXT: s_waitcnt expcnt(2)
51865187; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:60 ; 4-byte Folded Reload
5188+ ; NOOPT-NEXT: s_waitcnt expcnt(1)
51875189; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:64 ; 4-byte Folded Reload
51885190; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:72 ; 4-byte Folded Reload
5191+ ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
5192+ ; NOOPT-NEXT: s_waitcnt expcnt(0)
5193+ ; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
5194+ ; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
51895195; NOOPT-NEXT: s_waitcnt vmcnt(0)
5196+ ; NOOPT-NEXT: v_readlane_b32 s0, v18, 23
5197+ ; NOOPT-NEXT: v_readlane_b32 s1, v18, 24
51905198; NOOPT-NEXT: v_readfirstlane_b32 s2, v16
51915199; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v16
51925200; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -5278,14 +5286,8 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
52785286; NOOPT-NEXT: ; implicit-def: $vgpr0
52795287; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
52805288; NOOPT-NEXT: .LBB16_4: ; =>This Inner Loop Header: Depth=1
5281- ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
5282- ; NOOPT-NEXT: s_waitcnt expcnt(0)
5283- ; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
5284- ; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
5285- ; NOOPT-NEXT: s_waitcnt vmcnt(0)
5286- ; NOOPT-NEXT: v_readlane_b32 s0, v18, 28
5287- ; NOOPT-NEXT: v_readlane_b32 s1, v18, 29
52885289; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:152 ; 4-byte Folded Reload
5290+ ; NOOPT-NEXT: s_waitcnt expcnt(1)
52895291; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:88 ; 4-byte Folded Reload
52905292; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:92 ; 4-byte Folded Reload
52915293; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:96 ; 4-byte Folded Reload
@@ -5296,14 +5298,26 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
52965298; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:116 ; 4-byte Folded Reload
52975299; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:120 ; 4-byte Folded Reload
52985300; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:124 ; 4-byte Folded Reload
5301+ ; NOOPT-NEXT: s_waitcnt expcnt(6)
52995302; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:128 ; 4-byte Folded Reload
5303+ ; NOOPT-NEXT: s_waitcnt expcnt(5)
53005304; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:132 ; 4-byte Folded Reload
5305+ ; NOOPT-NEXT: s_waitcnt expcnt(4)
53015306; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:136 ; 4-byte Folded Reload
5307+ ; NOOPT-NEXT: s_waitcnt expcnt(3)
53025308; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:140 ; 4-byte Folded Reload
5309+ ; NOOPT-NEXT: s_waitcnt expcnt(2)
53035310; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:144 ; 4-byte Folded Reload
5311+ ; NOOPT-NEXT: s_waitcnt expcnt(1)
53045312; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:148 ; 4-byte Folded Reload
53055313; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:68 ; 4-byte Folded Reload
5314+ ; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
5315+ ; NOOPT-NEXT: s_waitcnt expcnt(0)
5316+ ; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 ; 4-byte Folded Reload
5317+ ; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
53065318; NOOPT-NEXT: s_waitcnt vmcnt(0)
5319+ ; NOOPT-NEXT: v_readlane_b32 s0, v18, 28
5320+ ; NOOPT-NEXT: v_readlane_b32 s1, v18, 29
53075321; NOOPT-NEXT: v_readfirstlane_b32 s2, v16
53085322; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v16
53095323; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -5889,13 +5903,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
58895903; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill
58905904; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
58915905; NOOPT-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
5892- ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
5893- ; NOOPT-NEXT: s_waitcnt expcnt(0)
5894- ; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
5895- ; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
5896- ; NOOPT-NEXT: s_waitcnt vmcnt(0)
5897- ; NOOPT-NEXT: v_readlane_b32 s0, v32, 7
5898- ; NOOPT-NEXT: v_readlane_b32 s1, v32, 8
58995906; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:4 ; 4-byte Folded Reload
59005907; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:8 ; 4-byte Folded Reload
59015908; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:12 ; 4-byte Folded Reload
@@ -5921,7 +5928,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
59215928; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload
59225929; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload
59235930; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload
5931+ ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
5932+ ; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
5933+ ; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
59245934; NOOPT-NEXT: s_waitcnt vmcnt(0)
5935+ ; NOOPT-NEXT: v_readlane_b32 s0, v32, 7
5936+ ; NOOPT-NEXT: v_readlane_b32 s1, v32, 8
59255937; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
59265938; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
59275939; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -6023,13 +6035,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
60236035; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill
60246036; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
60256037; NOOPT-NEXT: .LBB17_4: ; =>This Inner Loop Header: Depth=1
6026- ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
6027- ; NOOPT-NEXT: s_waitcnt expcnt(0)
6028- ; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
6029- ; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
6030- ; NOOPT-NEXT: s_waitcnt vmcnt(0)
6031- ; NOOPT-NEXT: v_readlane_b32 s0, v32, 11
6032- ; NOOPT-NEXT: v_readlane_b32 s1, v32, 12
60336038; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:152 ; 4-byte Folded Reload
60346039; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Reload
60356040; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Reload
@@ -6055,7 +6060,12 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
60556060; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Reload
60566061; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:216 ; 4-byte Folded Reload
60576062; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:76 ; 4-byte Folded Reload
6063+ ; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
6064+ ; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 ; 4-byte Folded Reload
6065+ ; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
60586066; NOOPT-NEXT: s_waitcnt vmcnt(0)
6067+ ; NOOPT-NEXT: v_readlane_b32 s0, v32, 11
6068+ ; NOOPT-NEXT: v_readlane_b32 s1, v32, 12
60596069; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
60606070; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
60616071; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -9146,13 +9156,6 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
91469156; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
91479157; NOOPT-NEXT: .LBB26_3: ; Parent Loop BB26_1 Depth=1
91489158; NOOPT-NEXT: ; => This Inner Loop Header: Depth=2
9149- ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
9150- ; NOOPT-NEXT: s_waitcnt expcnt(0)
9151- ; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
9152- ; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
9153- ; NOOPT-NEXT: s_waitcnt vmcnt(0)
9154- ; NOOPT-NEXT: v_readlane_b32 s0, v18, 6
9155- ; NOOPT-NEXT: v_readlane_b32 s1, v18, 7
91569159; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload
91579160; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload
91589161; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload
@@ -9178,7 +9181,12 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
91789181; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload
91799182; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload
91809183; NOOPT-NEXT: buffer_load_dword v17, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload
9184+ ; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
9185+ ; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 ; 4-byte Folded Reload
9186+ ; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
91819187; NOOPT-NEXT: s_waitcnt vmcnt(0)
9188+ ; NOOPT-NEXT: v_readlane_b32 s0, v18, 6
9189+ ; NOOPT-NEXT: v_readlane_b32 s1, v18, 7
91829190; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
91839191; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
91849192; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
@@ -9637,13 +9645,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
96379645; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill
96389646; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
96399647; NOOPT-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1
9640- ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1
9641- ; NOOPT-NEXT: s_waitcnt expcnt(0)
9642- ; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload
9643- ; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
9644- ; NOOPT-NEXT: s_waitcnt vmcnt(0)
9645- ; NOOPT-NEXT: v_readlane_b32 s0, v33, 9
9646- ; NOOPT-NEXT: v_readlane_b32 s1, v33, 10
96479648; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
96489649; NOOPT-NEXT: buffer_load_dword v1, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
96499650; NOOPT-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
@@ -9669,7 +9670,12 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
96699670; NOOPT-NEXT: buffer_load_dword v15, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload
96709671; NOOPT-NEXT: buffer_load_dword v16, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload
96719672; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload
9673+ ; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1
9674+ ; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 ; 4-byte Folded Reload
9675+ ; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
96729676; NOOPT-NEXT: s_waitcnt vmcnt(0)
9677+ ; NOOPT-NEXT: v_readlane_b32 s0, v33, 9
9678+ ; NOOPT-NEXT: v_readlane_b32 s1, v33, 10
96739679; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
96749680; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
96759681; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
0 commit comments