-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[LV] Disable MinBW when the entire chain is cast/load instructions. #117330
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LV] Disable MinBW when the entire chain is cast/load instructions. #117330
Conversation
If all instructions in the MinBW chain are cast/load instructions, we cannot get any benefit from narrowing the type, since no other users use it. This also causes the legacy cost model to compute a wrong cost when the MinBW analysis indicates that the type can be narrowed but the narrowing is not performed in the subsequent VPlan optimizations.
|
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-transforms Author: Elvis Wang (ElvisWang123) Changes: If all instructions in the MinBW chain are cast/load instructions, we cannot get any benefit from narrowing the type, since no other users use it. This also causes the legacy cost model to compute a wrong cost when the MinBW analysis indicates that the type can be narrowed but the narrowing is not performed in the subsequent VPlan optimizations. This patch also fixes #115744, where the MinBW analysis found a type that could be narrowed further. Full diff: https://github.com/llvm/llvm-project/pull/117330.diff 2 Files Affected:
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 1789671276ffaf..6102e54586b4c7 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -742,6 +742,13 @@ llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
Abort = true;
break;
}
+
+ // If all of instructions in the chain are load and cast instructions, we
+ // cannot get any benefit from MinBW.
+ if (all_of(llvm::make_range(ECs.member_begin(I), ECs.member_end()),
+ [](Value *M) { return isa<CastInst, LoadInst>(M); }))
+ Abort = true;
+
if (Abort)
continue;
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-cast.ll b/llvm/test/Transforms/LoopVectorize/trunc-cast.ll
new file mode 100644
index 00000000000000..b76561d5584eb3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/trunc-cast.ll
@@ -0,0 +1,162 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt --passes=loop-vectorize -S %s -mtriple riscv64 -mattr=+v | FileCheck %s
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+
+define void @h(ptr %i, ptr %k, i64 %idxprom.us) #0 {
+; CHECK-LABEL: define void @h(
+; CHECK-SAME: ptr [[I:%.*]], ptr [[K:%.*]], i64 [[IDXPROM_US:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 1073741824, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1073741824, [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1073741824, [[TMP5]]
+; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 2 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 2 x i32> [[TMP9]], splat (i32 4)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i32> zeroinitializer, [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul i32 4, [[TMP7]]
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP11]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[DOTSPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 2
+; CHECK-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1
+; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 2 x i8> poison, i8 0, i32 [[TMP14]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[I]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 2 x i8> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = zext <vscale x 2 x i32> [[VEC_IND]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr [2 x i16], ptr [[K]], <vscale x 2 x i64> [[TMP15]], i64 [[IDXPROM_US]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0(<vscale x 2 x ptr> [[TMP16]], i32 2, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i16> poison)
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <vscale x 2 x i16> [[WIDE_MASKED_GATHER]], zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[TMP18:%.*]] = shl <vscale x 2 x i64> zeroinitializer, [[WIDE_MASKED_GATHER1]]
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <vscale x 2 x i64> [[TMP18]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = xor <vscale x 2 x i1> [[TMP19]], splat (i1 true)
+; CHECK-NEXT: [[TMP21:%.*]] = select <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[TMP20]], <vscale x 2 x i1> zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP21]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[TMP22:%.*]] = trunc <vscale x 2 x i64> [[WIDE_MASKED_GATHER2]] to <vscale x 2 x i32>
+; CHECK-NEXT: [[TMP23:%.*]] = xor <vscale x 2 x i1> [[TMP17]], splat (i1 true)
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP21]], <vscale x 2 x i32> [[TMP22]], <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI3:%.*]] = select <vscale x 2 x i1> [[TMP23]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[PREDPHI]]
+; CHECK-NEXT: [[TMP24:%.*]] = trunc <vscale x 2 x i32> [[PREDPHI3]] to <vscale x 2 x i8>
+; CHECK-NEXT: [[TMP25]] = mul <vscale x 2 x i8> zeroinitializer, [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr [2 x i64], ptr [[I]], i64 0, <vscale x 2 x i64> [[TMP15]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP26]], i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[TMP27:%.*]] = trunc <vscale x 2 x i64> [[WIDE_MASKED_GATHER4]] to <vscale x 2 x i16>
+; CHECK-NEXT: [[TMP28:%.*]] = xor <vscale x 2 x i16> [[TMP27]], zeroinitializer
+; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 2
+; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <vscale x 2 x i16> [[TMP28]], i32 [[TMP31]]
+; CHECK-NEXT: store i16 [[TMP32]], ptr null, align 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], 2
+; CHECK-NEXT: [[TMP36:%.*]] = sub i32 [[TMP35]], 1
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 2 x i8> [[TMP25]], i32 [[TMP36]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_BODY_US:.*]]
+; CHECK: [[FOR_BODY_US]]:
+; CHECK-NEXT: [[L_046_US:%.*]] = phi i32 [ [[ADD_US:%.*]], %[[COND_END23_US:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[CONV284345_US:%.*]] = phi i8 [ [[CONV28_US:%.*]], %[[COND_END23_US]] ], [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[IDXPROM_US1:%.*]] = zext i32 [[L_046_US]] to i64
+; CHECK-NEXT: [[ARRAYIDX3_US:%.*]] = getelementptr [2 x i16], ptr [[K]], i64 [[IDXPROM_US1]], i64 [[IDXPROM_US]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX3_US]], align 2
+; CHECK-NEXT: [[TOBOOL4_NOT_US:%.*]] = icmp eq i16 [[TMP37]], 0
+; CHECK-NEXT: br i1 [[TOBOOL4_NOT_US]], label %[[COND_FALSE7_US:.*]], label %[[COND_END23_US]]
+; CHECK: [[COND_FALSE7_US]]:
+; CHECK-NEXT: [[TMP38:%.*]] = load i64, ptr [[I]], align 8
+; CHECK-NEXT: [[SHL_US:%.*]] = shl i64 0, [[TMP38]]
+; CHECK-NEXT: [[TOBOOL12_NOT_US:%.*]] = icmp eq i64 [[SHL_US]], 0
+; CHECK-NEXT: br i1 [[TOBOOL12_NOT_US]], label %[[COND_END23_US]], label %[[COND_TRUE14_US:.*]]
+; CHECK: [[COND_TRUE14_US]]:
+; CHECK-NEXT: [[TMP39:%.*]] = load i64, ptr [[I]], align 8
+; CHECK-NEXT: [[TMP40:%.*]] = trunc i64 [[TMP39]] to i32
+; CHECK-NEXT: br label %[[COND_END23_US]]
+; CHECK: [[COND_END23_US]]:
+; CHECK-NEXT: [[COND24_US:%.*]] = phi i32 [ [[TMP40]], %[[COND_TRUE14_US]] ], [ 0, %[[FOR_BODY_US]] ], [ 0, %[[COND_FALSE7_US]] ]
+; CHECK-NEXT: [[TMP41:%.*]] = trunc i32 [[COND24_US]] to i8
+; CHECK-NEXT: [[CONV28_US]] = mul i8 0, [[TMP41]]
+; CHECK-NEXT: [[ARRAYIDX31_US:%.*]] = getelementptr [2 x i64], ptr [[I]], i64 0, i64 [[IDXPROM_US1]]
+; CHECK-NEXT: [[TMP42:%.*]] = load i64, ptr [[ARRAYIDX31_US]], align 8
+; CHECK-NEXT: [[TMP43:%.*]] = trunc i64 [[TMP42]] to i16
+; CHECK-NEXT: [[CONV32_US:%.*]] = xor i16 [[TMP43]], 0
+; CHECK-NEXT: store i16 [[CONV32_US]], ptr null, align 2
+; CHECK-NEXT: [[ADD_US]] = add i32 [[L_046_US]], 4
+; CHECK-NEXT: [[TOBOOL_NOT_US:%.*]] = icmp eq i32 [[ADD_US]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT_US]], label %[[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:.*]], label %[[FOR_BODY_US]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]]:
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i8 [ [[TMP41]], %[[COND_END23_US]] ]
+; CHECK-NEXT: store i8 [[DOTLCSSA]], ptr null, align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body.us
+
+for.body.us: ; preds = %cond.end23.us, %entry
+ %l.046.us = phi i32 [ %add.us, %cond.end23.us ], [ 0, %entry ]
+ %conv284345.us = phi i8 [ %conv28.us, %cond.end23.us ], [ 0, %entry ]
+ %idxprom.us1 = zext i32 %l.046.us to i64
+ %arrayidx3.us = getelementptr [2 x i16], ptr %k, i64 %idxprom.us1, i64 %idxprom.us
+ %0 = load i16, ptr %arrayidx3.us, align 2
+ %tobool4.not.us = icmp eq i16 %0, 0
+ br i1 %tobool4.not.us, label %cond.false7.us, label %cond.end23.us
+
+cond.false7.us: ; preds = %for.body.us
+ %1 = load i64, ptr %i, align 8
+ %shl.us = shl i64 0, %1
+ %tobool12.not.us = icmp eq i64 %shl.us, 0
+ br i1 %tobool12.not.us, label %cond.end23.us, label %cond.true14.us
+
+cond.true14.us: ; preds = %cond.false7.us
+ %2 = load i64, ptr %i, align 8
+ %3 = trunc i64 %2 to i32
+ br label %cond.end23.us
+
+cond.end23.us: ; preds = %cond.true14.us, %cond.false7.us, %for.body.us
+ %cond24.us = phi i32 [ %3, %cond.true14.us ], [ 0, %for.body.us ], [ 0, %cond.false7.us ]
+ %4 = trunc i32 %cond24.us to i8
+ %conv28.us = mul i8 0, %4
+ %arrayidx31.us = getelementptr [2 x i64], ptr %i, i64 0, i64 %idxprom.us1
+ %5 = load i64, ptr %arrayidx31.us, align 8
+ %6 = trunc i64 %5 to i16
+ %conv32.us = xor i16 %6, 0
+ store i16 %conv32.us, ptr null, align 2
+ %add.us = add i32 %l.046.us, 4
+ %tobool.not.us = icmp eq i32 %add.us, 0
+ br i1 %tobool.not.us, label %for.cond.for.cond.cleanup_crit_edge, label %for.body.us
+
+for.cond.for.cond.cleanup_crit_edge: ; preds = %cond.end23.us
+ store i8 %4, ptr null, align 1
+ ret void
+}
+
+attributes #0 = { "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zicsr,+zifencei,+zmmul,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-b,-e,-experimental-smctr,-experimental-ssctr,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-h,-sha,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-supm,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-za64rs,-zaamo,-zabha,-zacas,-zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfbfmin,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zicond,-zihintntl,-zihintpause,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-ztso,-zvbb,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
|
If all instructions in the MinBW chain are cast/load instructions, we cannot get any benefit from narrowing the type, since no other users use it.
This also causes the legacy cost model to compute a wrong cost when the MinBW analysis indicates that the type can be narrowed but the narrowing is not performed in the subsequent VPlan optimizations.
This patch also fixes #115744, where the MinBW analysis found that
`%3 = trunc i64 %2 to i32` can be further narrowed to `i8`. But the user of the
`trunc` is a `PHI` which remains `i32`, so the VPlan transform cannot narrow the type. And the legacy cost model uses the information from the MinBW analysis and gets a wrong cost that is misaligned with the VPlan-based cost model.