Skip to content

Commit c84df88

Browse files
committed
[LSV] Address feedback
- Correct function name in merge-vectors.ll - Tighten an assertion before computing delta - Simplify check for overlapping chains - Rebase
1 parent 9d55b17 commit c84df88

File tree

7 files changed

+47
-92
lines changed

7 files changed

+47
-92
lines changed

llvm/lib/Transforms/Utils/Local.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3184,9 +3184,6 @@ void llvm::copyMetadataForStore(StoreInst &Dest, const StoreInst &Source) {
31843184
if (NewType->isPointerTy())
31853185
Dest.setMetadata(ID, N);
31863186
break;
3187-
3188-
case LLVMContext::MD_range:
3189-
break;
31903187
}
31913188
}
31923189
}

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -507,9 +507,10 @@ Value *Vectorizer::insertCast(Value *Val, Type *DstTy) {
507507

508508
std::optional<APInt> Vectorizer::computeLeaderDelta(Instruction *I1,
509509
Instruction *I2) {
510-
assert((isa<LoadInst>(I1) || isa<StoreInst>(I1)) &&
511-
(isa<LoadInst>(I2) || isa<StoreInst>(I2)) &&
512-
"computeLeaderDelta must be called with load or store instructions");
510+
assert(((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
511+
(isa<StoreInst>(I1) && isa<StoreInst>(I2))) &&
512+
"computeLeaderDelta must be called with two load or two store "
513+
"instructions");
513514
Instruction *CtxInst = I1->comesBefore(I2) ? I2 : I1;
514515
const Value *Ptr1 = getLoadStorePointerOperand(I1);
515516
const Value *Ptr2 = getLoadStorePointerOperand(I2);
@@ -519,19 +520,15 @@ std::optional<APInt> Vectorizer::computeLeaderDelta(Instruction *I1,
519520

520521
bool Vectorizer::chainsOverlapAfterRebase(const Chain &A, const Chain &B,
521522
const APInt &Delta) const {
522-
for (const ChainElem &EB : B) {
523-
APInt OffB = EB.OffsetFromLeader + Delta;
524-
unsigned SizeB = DL.getTypeStoreSize(getLoadStoreType(EB.Inst));
525-
ConstantRange BRange(OffB, OffB + SizeB);
526-
for (const ChainElem &EA : A) {
527-
APInt OffA = EA.OffsetFromLeader;
528-
unsigned SizeA = DL.getTypeStoreSize(getLoadStoreType(EA.Inst));
529-
ConstantRange ARange(OffA, OffA + SizeA);
530-
if (!ARange.intersectWith(BRange).isEmptySet())
531-
return true;
532-
}
533-
}
534-
return false;
523+
ConstantRange ARange(
524+
A.front().OffsetFromLeader,
525+
A.back().OffsetFromLeader +
526+
DL.getTypeStoreSize(getLoadStoreType(A.back().Inst)));
527+
ConstantRange BRange(
528+
B.front().OffsetFromLeader + Delta,
529+
B.back().OffsetFromLeader + Delta +
530+
DL.getTypeStoreSize(getLoadStoreType(B.back().Inst)));
531+
return !ARange.intersectWith(BRange).isEmptySet();
535532
}
536533

537534
void Vectorizer::rebaseChain(Chain &C, const APInt &Delta) {

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -518,14 +518,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
518518
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
519519
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1
520520
; GFX908-NEXT: s_sub_i32 s3, 0, s1
521-
; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s2
522-
; GFX908-NEXT: v_mov_b32_e32 v19, 0
523-
; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0
524-
; GFX908-NEXT: v_mov_b32_e32 v0, 0
525-
; GFX908-NEXT: v_mov_b32_e32 v1, 0
526-
; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
527-
; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2
528-
; GFX908-NEXT: v_readfirstlane_b32 s5, v2
521+
; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s2
522+
; GFX908-NEXT: v_mov_b32_e32 v17, 0
523+
; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0
524+
; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
525+
; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0
526+
; GFX908-NEXT: v_readfirstlane_b32 s5, v0
529527
; GFX908-NEXT: s_mul_i32 s3, s3, s5
530528
; GFX908-NEXT: s_mul_hi_u32 s3, s5, s3
531529
; GFX908-NEXT: s_add_i32 s5, s5, s3
@@ -541,12 +539,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
541539
; GFX908-NEXT: s_cmp_ge_u32 s0, s1
542540
; GFX908-NEXT: s_cselect_b32 s8, s5, s3
543541
; GFX908-NEXT: s_lshr_b32 s2, s2, 16
544-
; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s2
542+
; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s2
545543
; GFX908-NEXT: s_lshl_b64 s[12:13], s[6:7], 5
544+
; GFX908-NEXT: v_mov_b32_e32 v0, 0
546545
; GFX908-NEXT: s_lshl_b64 s[10:11], s[16:17], 5
547546
; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1]
548547
; GFX908-NEXT: s_or_b32 s12, s12, 28
549548
; GFX908-NEXT: s_lshl_b64 s[14:15], s[8:9], 5
549+
; GFX908-NEXT: v_mov_b32_e32 v1, 0
550550
; GFX908-NEXT: s_waitcnt vmcnt(0)
551551
; GFX908-NEXT: v_readfirstlane_b32 s2, v16
552552
; GFX908-NEXT: s_and_b32 s2, 0xffff, s2
@@ -610,7 +610,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
610610
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
611611
; GFX908-NEXT: s_add_u32 s22, s20, s5
612612
; GFX908-NEXT: s_addc_u32 s23, s21, s9
613-
; GFX908-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc
613+
; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:-12 glc
614614
; GFX908-NEXT: s_waitcnt vmcnt(0)
615615
; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:-8 glc
616616
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -685,12 +685,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
685685
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1
686686
; GFX90A-NEXT: s_sub_i32 s3, 0, s1
687687
; GFX90A-NEXT: v_mov_b32_e32 v19, 0
688-
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0
689-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0
690-
; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
691-
; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v2
692-
; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s2
693-
; GFX90A-NEXT: v_readfirstlane_b32 s5, v3
688+
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0
689+
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0
690+
; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
691+
; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0
692+
; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s2
693+
; GFX90A-NEXT: v_readfirstlane_b32 s5, v1
694694
; GFX90A-NEXT: s_mul_i32 s3, s3, s5
695695
; GFX90A-NEXT: s_mul_hi_u32 s3, s5, s3
696696
; GFX90A-NEXT: s_add_i32 s5, s5, s3
@@ -706,7 +706,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
706706
; GFX90A-NEXT: s_cmp_ge_u32 s0, s1
707707
; GFX90A-NEXT: s_cselect_b32 s8, s5, s3
708708
; GFX90A-NEXT: s_lshr_b32 s2, s2, 16
709-
; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s2
709+
; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, s2
710710
; GFX90A-NEXT: s_lshl_b64 s[12:13], s[6:7], 5
711711
; GFX90A-NEXT: s_lshl_b64 s[10:11], s[16:17], 5
712712
; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1]
@@ -733,7 +733,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
733733
; GFX90A-NEXT: s_cbranch_vccz .LBB3_10
734734
; GFX90A-NEXT: ; %bb.3: ; %bb14
735735
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
736-
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
736+
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
737737
; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
738738
; GFX90A-NEXT: s_mov_b32 s5, s4
739739
; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3]

llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -119,15 +119,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
119119
; CHECK-NEXT: s_branch .LBB0_5
120120
; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1
121121
; CHECK-NEXT: s_mov_b64 s[22:23], 0
122-
<<<<<<< HEAD
123-
<<<<<<< HEAD
124122
; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11]
125-
=======
126-
; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[12:13]
127-
>>>>>>> 5cef8c6c2c95 ([LSV] Merge contiguous chains across scalar types)
128-
=======
129-
; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11]
130-
>>>>>>> 153ff02a7582 ([LSV] Check for power-of-two width)
131123
; CHECK-NEXT: s_mov_b64 s[8:9], s[20:21]
132124
; CHECK-NEXT: s_branch .LBB0_15
133125
; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_2 Depth=1
@@ -144,59 +136,28 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
144136
; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17]
145137
; CHECK-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[16:17]
146138
; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9]
147-
<<<<<<< HEAD
148139
; CHECK-NEXT: v_mov_b32_e32 v17, v16
149140
; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17]
150141
; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[12:13]
151-
=======
152-
; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17]
153-
<<<<<<< HEAD
154-
; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[14:15]
155-
>>>>>>> 5cef8c6c2c95 ([LSV] Merge contiguous chains across scalar types)
156-
=======
157-
; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[12:13]
158-
>>>>>>> 153ff02a7582 ([LSV] Check for power-of-two width)
159142
; CHECK-NEXT: s_cselect_b32 s23, s23, 0
160143
; CHECK-NEXT: s_cselect_b32 s22, s22, 0
161144
; CHECK-NEXT: s_mov_b64 s[8:9], -1
162145
; CHECK-NEXT: s_branch .LBB0_14
163146
; CHECK-NEXT: .LBB0_13: ; in Loop: Header=BB0_2 Depth=1
164147
; CHECK-NEXT: s_mov_b64 s[8:9], 0
165148
; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0
166-
<<<<<<< HEAD
167-
<<<<<<< HEAD
168-
; CHECK-NEXT: .LBB0_14: ; %Flow6
169-
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
170-
; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[24:25]
171-
; CHECK-NEXT: .LBB0_15: ; %Flow6
172-
=======
173-
; CHECK-NEXT: .LBB0_14: ; %Flow7
174-
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
175-
; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[24:25]
176-
; CHECK-NEXT: .LBB0_15: ; %Flow7
177-
>>>>>>> 5cef8c6c2c95 ([LSV] Merge contiguous chains across scalar types)
178-
=======
179149
; CHECK-NEXT: .LBB0_14: ; %Flow6
180150
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
181151
; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[24:25]
182152
; CHECK-NEXT: .LBB0_15: ; %Flow6
183-
>>>>>>> 153ff02a7582 ([LSV] Check for power-of-two width)
184153
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
185154
; CHECK-NEXT: s_mov_b64 s[24:25], -1
186155
; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9]
187156
; CHECK-NEXT: s_cbranch_vccz .LBB0_1
188157
; CHECK-NEXT: ; %bb.16: ; %._crit_edge2105.i.i.i2330
189158
; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
190159
; CHECK-NEXT: s_mov_b64 s[24:25], 0
191-
<<<<<<< HEAD
192-
<<<<<<< HEAD
193-
; CHECK-NEXT: global_store_dwordx2 v20, v[20:21], s[12:13]
194-
=======
195-
; CHECK-NEXT: global_store_dwordx2 v20, v[20:21], s[14:15]
196-
>>>>>>> 5cef8c6c2c95 ([LSV] Merge contiguous chains across scalar types)
197-
=======
198160
; CHECK-NEXT: global_store_dwordx2 v20, v[20:21], s[12:13]
199-
>>>>>>> 153ff02a7582 ([LSV] Check for power-of-two width)
200161
; CHECK-NEXT: s_branch .LBB0_1
201162
; CHECK-NEXT: .LBB0_17: ; %DummyReturnBlock
202163
; CHECK-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/mad_64_32.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1117,19 +1117,19 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
11171117
;
11181118
; GFX1250-LABEL: mad_i64_i32_uniform:
11191119
; GFX1250: ; %bb.0:
1120+
; GFX1250-NEXT: s_clause 0x1
11201121
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
11211122
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
11221123
; GFX1250-NEXT: s_wait_xcnt 0x0
1123-
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1124-
; GFX1250-NEXT: s_mov_b32 s7, 0
1124+
; GFX1250-NEXT: s_mov_b32 s5, 0
11251125
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
11261126
; GFX1250-NEXT: s_wait_kmcnt 0x0
1127-
; GFX1250-NEXT: s_mov_b32 s6, s2
1127+
; GFX1250-NEXT: s_mov_b32 s4, s2
11281128
; GFX1250-NEXT: s_mov_b32 s2, s3
1129-
; GFX1250-NEXT: s_mov_b32 s3, s7
1129+
; GFX1250-NEXT: s_mov_b32 s3, s5
11301130
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1131-
; GFX1250-NEXT: s_mul_u64 s[2:3], s[6:7], s[2:3]
1132-
; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
1131+
; GFX1250-NEXT: s_mul_u64 s[2:3], s[4:5], s[2:3]
1132+
; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7]
11331133
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
11341134
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
11351135
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]

llvm/test/CodeGen/AMDGPU/min.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,13 +1172,13 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
11721172
;
11731173
; GFX1250-LABEL: s_test_imin_sle_v4i16:
11741174
; GFX1250: ; %bb.0:
1175+
; GFX1250-NEXT: s_clause 0x1
11751176
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1176-
; GFX1250-NEXT: s_wait_xcnt 0x0
1177-
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
1177+
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
11781178
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
11791179
; GFX1250-NEXT: s_wait_kmcnt 0x0
1180-
; GFX1250-NEXT: v_pk_min_i16 v1, s3, s5
1181-
; GFX1250-NEXT: v_pk_min_i16 v0, s2, s4
1180+
; GFX1250-NEXT: v_pk_min_i16 v1, s3, s7
1181+
; GFX1250-NEXT: v_pk_min_i16 v0, s2, s6
11821182
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
11831183
; GFX1250-NEXT: s_endpgm
11841184
%cmp = icmp sle <4 x i16> %a, %b
@@ -1686,13 +1686,13 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
16861686
;
16871687
; GFX1250-LABEL: s_test_imin_slt_v2i32:
16881688
; GFX1250: ; %bb.0:
1689+
; GFX1250-NEXT: s_clause 0x1
16891690
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1690-
; GFX1250-NEXT: s_wait_xcnt 0x0
1691-
; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
1691+
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
16921692
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
16931693
; GFX1250-NEXT: s_wait_kmcnt 0x0
1694-
; GFX1250-NEXT: s_min_i32 s2, s2, s4
1695-
; GFX1250-NEXT: s_min_i32 s3, s3, s5
1694+
; GFX1250-NEXT: s_min_i32 s2, s2, s6
1695+
; GFX1250-NEXT: s_min_i32 s3, s3, s7
16961696
; GFX1250-NEXT: v_mov_b32_e32 v0, s2
16971697
; GFX1250-NEXT: v_mov_b32_e32 v1, s3
16981698
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]

llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,8 @@ entry:
149149
ret void
150150
}
151151

152-
define amdgpu_kernel void @merge_load_i32_v2i64(ptr addrspace(1) nocapture %a) #0 {
153-
; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i64(
152+
define amdgpu_kernel void @no_merge_load_i32_v2i8(ptr addrspace(1) nocapture %a) #0 {
153+
; CHECK-LABEL: define amdgpu_kernel void @no_merge_load_i32_v2i8(
154154
; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
155155
; CHECK-NEXT: [[ENTRY:.*:]]
156156
; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1

0 commit comments

Comments
 (0)