[LSV] Address feedback

gandhi56 · gandhi56 · commit c84df8896cb5 · 2025-10-03T10:47:54.000-05:00
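- Correct function name in merge-vectors.ll (merge_load_i32_v2i64 -> no_merge_load_i32_v2i8)
- Tighten the assertion in computeLeaderDelta to require two loads or two stores before computing the delta
- Simplify the check for overlapping chains in chainsOverlapAfterRebase to compare each chain's overall range
- Rebase (removes leftover merge-conflict markers in av-split-dead-valno-crash.ll and updates affected AMDGPU test checks)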
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3184,9 +3184,6 @@ void llvm::copyMetadataForStore(StoreInst &Dest, const StoreInst &Source) {
       if (NewType->isPointerTy())
         Dest.setMetadata(ID, N);
       break;
-
-    case LLVMContext::MD_range:
-      break;
     }
   }
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -507,9 +507,10 @@ Value *Vectorizer::insertCast(Value *Val, Type *DstTy) {
 
 std::optional<APInt> Vectorizer::computeLeaderDelta(Instruction *I1,
                                                     Instruction *I2) {
-  assert((isa<LoadInst>(I1) || isa<StoreInst>(I1)) &&
-         (isa<LoadInst>(I2) || isa<StoreInst>(I2)) &&
-         "computeLeaderDelta must be called with load or store instructions");
+  assert(((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
+          (isa<StoreInst>(I1) && isa<StoreInst>(I2))) &&
+         "computeLeaderDelta must be called with two load or two store "
+         "instructions");
   Instruction *CtxInst = I1->comesBefore(I2) ? I2 : I1;
   const Value *Ptr1 = getLoadStorePointerOperand(I1);
   const Value *Ptr2 = getLoadStorePointerOperand(I2);
@@ -519,19 +520,15 @@ std::optional<APInt> Vectorizer::computeLeaderDelta(Instruction *I1,
 
 bool Vectorizer::chainsOverlapAfterRebase(const Chain &A, const Chain &B,
                                           const APInt &Delta) const {
-  for (const ChainElem &EB : B) {
-    APInt OffB = EB.OffsetFromLeader + Delta;
-    unsigned SizeB = DL.getTypeStoreSize(getLoadStoreType(EB.Inst));
-    ConstantRange BRange(OffB, OffB + SizeB);
-    for (const ChainElem &EA : A) {
-      APInt OffA = EA.OffsetFromLeader;
-      unsigned SizeA = DL.getTypeStoreSize(getLoadStoreType(EA.Inst));
-      ConstantRange ARange(OffA, OffA + SizeA);
-      if (!ARange.intersectWith(BRange).isEmptySet())
-        return true;
-    }
-  }
-  return false;
+  ConstantRange ARange(
+      A.front().OffsetFromLeader,
+      A.back().OffsetFromLeader +
+          DL.getTypeStoreSize(getLoadStoreType(A.back().Inst)));
+  ConstantRange BRange(
+      B.front().OffsetFromLeader + Delta,
+      B.back().OffsetFromLeader + Delta +
+          DL.getTypeStoreSize(getLoadStoreType(B.back().Inst)));
+  return !ARange.intersectWith(BRange).isEmptySet();
 }
 
 void Vectorizer::rebaseChain(Chain &C, const APInt &Delta) {
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -518,14 +518,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s1
 ; GFX908-NEXT:    s_sub_i32 s3, 0, s1
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v17, s2
-; GFX908-NEXT:    v_mov_b32_e32 v19, 0
-; GFX908-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX908-NEXT:    v_mov_b32_e32 v0, 0
-; GFX908-NEXT:    v_mov_b32_e32 v1, 0
-; GFX908-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX908-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX908-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v18, s2
+; GFX908-NEXT:    v_mov_b32_e32 v17, 0
+; GFX908-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX908-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX908-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX908-NEXT:    v_readfirstlane_b32 s5, v0
 ; GFX908-NEXT:    s_mul_i32 s3, s3, s5
 ; GFX908-NEXT:    s_mul_hi_u32 s3, s5, s3
 ; GFX908-NEXT:    s_add_i32 s5, s5, s3
@@ -541,12 +539,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    s_cmp_ge_u32 s0, s1
 ; GFX908-NEXT:    s_cselect_b32 s8, s5, s3
 ; GFX908-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v18, s2
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v19, s2
 ; GFX908-NEXT:    s_lshl_b64 s[12:13], s[6:7], 5
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX908-NEXT:    s_lshl_b64 s[10:11], s[16:17], 5
 ; GFX908-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
 ; GFX908-NEXT:    s_or_b32 s12, s12, 28
 ; GFX908-NEXT:    s_lshl_b64 s[14:15], s[8:9], 5
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_readfirstlane_b32 s2, v16
 ; GFX908-NEXT:    s_and_b32 s2, 0xffff, s2
@@ -610,7 +610,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT:    s_add_u32 s22, s20, s5
 ; GFX908-NEXT:    s_addc_u32 s23, s21, s9
-; GFX908-NEXT:    global_load_dword v21, v19, s[22:23] offset:-12 glc
+; GFX908-NEXT:    global_load_dword v21, v17, s[22:23] offset:-12 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    global_load_dword v20, v17, s[22:23] offset:-8 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
@@ -685,12 +685,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s1
 ; GFX90A-NEXT:    s_sub_i32 s3, 0, s1
 ; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GFX90A-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v2
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, s2
-; GFX90A-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
+; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v0
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; GFX90A-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX90A-NEXT:    s_mul_i32 s3, s3, s5
 ; GFX90A-NEXT:    s_mul_hi_u32 s3, s5, s3
 ; GFX90A-NEXT:    s_add_i32 s5, s5, s3
@@ -706,7 +706,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    s_cmp_ge_u32 s0, s1
 ; GFX90A-NEXT:    s_cselect_b32 s8, s5, s3
 ; GFX90A-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, s2
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v1, s2
 ; GFX90A-NEXT:    s_lshl_b64 s[12:13], s[6:7], 5
 ; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[16:17], 5
 ; GFX90A-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
@@ -733,7 +733,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_10
 ; GFX90A-NEXT:  ; %bb.3: ; %bb14
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
+; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
 ; GFX90A-NEXT:    s_mov_b32 s5, s4
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
@@ -119,15 +119,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    s_branch .LBB0_5
 ; CHECK-NEXT:  .LBB0_9: ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[22:23], 0
-<<<<<<< HEAD
-<<<<<<< HEAD
 ; CHECK-NEXT:    v_mov_b64_e32 v[30:31], s[10:11]
-=======
-; CHECK-NEXT:    v_mov_b64_e32 v[30:31], s[12:13]
->>>>>>> 5cef8c6c2c95 ([LSV] Merge contiguous chains across scalar types)
-=======
-; CHECK-NEXT:    v_mov_b64_e32 v[30:31], s[10:11]
->>>>>>> 153ff02a7582 ([LSV] Check for power-of-two width)
 ; CHECK-NEXT:    s_mov_b64 s[8:9], s[20:21]
 ; CHECK-NEXT:    s_branch .LBB0_15
 ; CHECK-NEXT:  .LBB0_10: ; in Loop: Header=BB0_2 Depth=1
@@ -144,59 +136,28 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2,
 ; CHECK-NEXT:    v_cndmask_b32_e64 v23, v23, 0, s[16:17]
 ; CHECK-NEXT:    v_cndmask_b32_e64 v22, v22, 0, s[16:17]
 ; CHECK-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[8:9]
-<<<<<<< HEAD
 ; CHECK-NEXT:    v_mov_b32_e32 v17, v16
 ; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[16:17]
 ; CHECK-NEXT:    global_store_dwordx2 v20, v[16:17], s[12:13]
-=======
-; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[16:17]
-<<<<<<< HEAD
-; CHECK-NEXT:    global_store_dwordx2 v20, v[16:17], s[14:15]
->>>>>>> 5cef8c6c2c95 ([LSV] Merge contiguous chains across scalar types)
-=======
-; CHECK-NEXT:    global_store_dwordx2 v20, v[16:17], s[12:13]
->>>>>>> 153ff02a7582 ([LSV] Check for power-of-two width)
 ; CHECK-NEXT:    s_cselect_b32 s23, s23, 0
 ; CHECK-NEXT:    s_cselect_b32 s22, s22, 0
 ; CHECK-NEXT:    s_mov_b64 s[8:9], -1
 ; CHECK-NEXT:    s_branch .LBB0_14
 ; CHECK-NEXT:  .LBB0_13: ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[8:9], 0
 ; CHECK-NEXT:    v_mov_b64_e32 v[22:23], 0
-<<<<<<< HEAD
-<<<<<<< HEAD
-; CHECK-NEXT:  .LBB0_14: ; %Flow6
-; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    v_mov_b64_e32 v[30:31], v[24:25]
-; CHECK-NEXT:  .LBB0_15: ; %Flow6
-=======
-; CHECK-NEXT:  .LBB0_14: ; %Flow7
-; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    v_mov_b64_e32 v[30:31], v[24:25]
-; CHECK-NEXT:  .LBB0_15: ; %Flow7
->>>>>>> 5cef8c6c2c95 ([LSV] Merge contiguous chains across scalar types)
-=======
 ; CHECK-NEXT:  .LBB0_14: ; %Flow6
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    v_mov_b64_e32 v[30:31], v[24:25]
 ; CHECK-NEXT:  .LBB0_15: ; %Flow6
->>>>>>> 153ff02a7582 ([LSV] Check for power-of-two width)
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[24:25], -1
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_1
 ; CHECK-NEXT:  ; %bb.16: ; %._crit_edge2105.i.i.i2330
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[24:25], 0
-<<<<<<< HEAD
-<<<<<<< HEAD
-; CHECK-NEXT:    global_store_dwordx2 v20, v[20:21], s[12:13]
-=======
-; CHECK-NEXT:    global_store_dwordx2 v20, v[20:21], s[14:15]
->>>>>>> 5cef8c6c2c95 ([LSV] Merge contiguous chains across scalar types)
-=======
 ; CHECK-NEXT:    global_store_dwordx2 v20, v[20:21], s[12:13]
->>>>>>> 153ff02a7582 ([LSV] Check for power-of-two width)
 ; CHECK-NEXT:    s_branch .LBB0_1
 ; CHECK-NEXT:  .LBB0_17: ; %DummyReturnBlock
 ; CHECK-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1117,19 +1117,19 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ;
 ; GFX1250-LABEL: mad_i64_i32_uniform:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1250-NEXT:    s_mov_b32 s7, 0
+; GFX1250-NEXT:    s_mov_b32 s5, 0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_mov_b32 s6, s2
+; GFX1250-NEXT:    s_mov_b32 s4, s2
 ; GFX1250-NEXT:    s_mov_b32 s2, s3
-; GFX1250-NEXT:    s_mov_b32 s3, s7
+; GFX1250-NEXT:    s_mov_b32 s3, s5
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT:    s_mul_u64 s[2:3], s[6:7], s[2:3]
-; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT:    s_mul_u64 s[2:3], s[4:5], s[2:3]
+; GFX1250-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[6:7]
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -1172,13 +1172,13 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ;
 ; GFX1250-LABEL: s_test_imin_sle_v4i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_pk_min_i16 v1, s3, s5
-; GFX1250-NEXT:    v_pk_min_i16 v0, s2, s4
+; GFX1250-NEXT:    v_pk_min_i16 v1, s3, s7
+; GFX1250-NEXT:    v_pk_min_i16 v0, s2, s6
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
   %cmp = icmp sle <4 x i16> %a, %b
@@ -1686,13 +1686,13 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ;
 ; GFX1250-LABEL: s_test_imin_slt_v2i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX1250-NEXT:    s_wait_xcnt 0x0
-; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    s_min_i32 s2, s2, s4
-; GFX1250-NEXT:    s_min_i32 s3, s3, s5
+; GFX1250-NEXT:    s_min_i32 s2, s2, s6
+; GFX1250-NEXT:    s_min_i32 s3, s3, s7
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -149,8 +149,8 @@ entry:
   ret void
 }
 
-define amdgpu_kernel void @merge_load_i32_v2i64(ptr addrspace(1) nocapture %a) #0 {
-; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i64(
+define amdgpu_kernel void @no_merge_load_i32_v2i8(ptr addrspace(1) nocapture %a) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @no_merge_load_i32_v2i8(
 ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1

Original file line number	Diff line number	Diff line change
`@@ -3184,9 +3184,6 @@ void llvm::copyMetadataForStore(StoreInst &Dest, const StoreInst &Source) {`
`3184`	`3184`	`if (NewType->isPointerTy())`
`3185`	`3185`	`Dest.setMetadata(ID, N);`
`3186`	`3186`	`break;`
`3187`		`-`
`3188`		`- case LLVMContext::MD_range:`
`3189`		`- break;`
`3190`	`3187`	`}`
`3191`	`3188`	`}`
`3192`	`3189`	`}`