-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[VectorUtils] Trivially vectorize ldexp, [l]lround #145545
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-analysis Author: Ramkumar Ramachandra (artagnon) Changes — Full diff: https://github.com/llvm/llvm-project/pull/145545.diff 2 Files Affected:
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 63fccee63c0ae..a391e92e84fc6 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::exp:
case Intrinsic::exp10:
case Intrinsic::exp2:
+ case Intrinsic::ldexp:
case Intrinsic::log:
case Intrinsic::log10:
case Intrinsic::log2:
@@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::canonicalize:
case Intrinsic::fptosi_sat:
case Intrinsic::fptoui_sat:
+ case Intrinsic::lround:
+ case Intrinsic::llround:
case Intrinsic::lrint:
case Intrinsic::llrint:
case Intrinsic::ucmp:
@@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
switch (ID) {
case Intrinsic::fptosi_sat:
case Intrinsic::fptoui_sat:
+ case Intrinsic::lround:
+ case Intrinsic::llround:
case Intrinsic::lrint:
case Intrinsic::llrint:
case Intrinsic::vp_lrint:
@@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
case Intrinsic::vp_is_fpclass:
return OpdIdx == 0;
case Intrinsic::powi:
+ case Intrinsic::ldexp:
return OpdIdx == -1 || OpdIdx == 1;
default:
return OpdIdx == -1;
diff --git a/llvm/test/Transforms/LoopVectorize/intrinsic.ll b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
index 9c910d70807a1..32c702fd94c67 100644
--- a/llvm/test/Transforms/LoopVectorize/intrinsic.ll
+++ b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
@@ -324,6 +324,58 @@ for.end: ; preds = %for.body, %entry
declare double @llvm.exp2.f64(double)
+define void @ldexp_f32i32(i32 %n, ptr %y, ptr %x, i32 %exp) {
+; CHECK-LABEL: @ldexp_f32i32(
+; CHECK: llvm.ldexp.v4f32.v4i32
+; CHECK: ret void
+;
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %call = tail call float @llvm.ldexp.f32.i32(float %0, i32 %exp)
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i32 %iv
+ store float %call, ptr %arrayidx2, align 4
+ %iv.next = add i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.ldexp.f32.i32(float, i32)
+
+define void @ldexp_f64i32(i32 %n, ptr %y, ptr %x, i32 %exp) {
+; CHECK-LABEL: @ldexp_f64i32(
+; CHECK: llvm.ldexp.v4f64.v4i32
+; CHECK: ret void
+;
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv
+ %0 = load double, ptr %arrayidx, align 8
+ %call = tail call double @llvm.ldexp.f64.i32(double %0, i32 %exp)
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i32 %iv
+ store double %call, ptr %arrayidx2, align 8
+ %iv.next = add i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.ldexp.f64.i32(double, i32)
+
define void @log_f32(i32 %n, ptr %y, ptr %x) {
; CHECK-LABEL: @log_f32(
; CHECK: llvm.log.v4f32
@@ -976,6 +1028,163 @@ for.end: ; preds = %for.body, %entry
declare double @llvm.roundeven.f64(double)
+
+define void @lround_i32f32(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @lround_i32f32(
+; CHECK: llvm.lround.v4i32.v4f32
+; CHECK: ret void
+;
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %call = tail call i32 @llvm.lround.i32.f32(float %0)
+ %arrayidx2 = getelementptr inbounds i32, ptr %x, i32 %iv
+ store i32 %call, ptr %arrayidx2, align 4
+ %iv.next = add i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i32 @llvm.lround.i32.f32(float)
+
+define void @lround_i32f64(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @lround_i32f64(
+; CHECK: llvm.lround.v4i32.v4f64
+; CHECK: ret void
+;
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv
+ %0 = load double, ptr %arrayidx, align 8
+ %call = tail call i32 @llvm.lround.i32.f64(double %0)
+ %arrayidx2 = getelementptr inbounds i32, ptr %x, i32 %iv
+ store i32 %call, ptr %arrayidx2, align 8
+ %iv.next = add i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i32 @llvm.lround.i32.f64(double)
+
+define void @lround_i64f32(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @lround_i64f32(
+; CHECK: llvm.lround.v4i64.v4f32
+; CHECK: ret void
+;
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %call = tail call i64 @llvm.lround.i64.f32(float %0)
+ %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv
+ store i64 %call, ptr %arrayidx2, align 4
+ %iv.next = add i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i64 @llvm.lround.i64.f32(float)
+
+define void @lround_i64f64(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @lround_i64f64(
+; CHECK: llvm.lround.v4i64.v4f64
+; CHECK: ret void
+;
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv
+ %0 = load double, ptr %arrayidx, align 8
+ %call = tail call i64 @llvm.lround.i64.f64(double %0)
+ %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv
+ store i64 %call, ptr %arrayidx2, align 8
+ %iv.next = add i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i64 @llvm.lround.i64.f64(double)
+
+define void @llround_i64f32(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @llround_i64f32(
+; CHECK: llvm.llround.v4i64.v4f32
+; CHECK: ret void
+;
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv
+ %0 = load float, ptr %arrayidx, align 4
+ %call = tail call i64 @llvm.llround.i64.f32(float %0)
+ %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv
+ store i64 %call, ptr %arrayidx2, align 4
+ %iv.next = add i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i64 @llvm.llround.i64.f32(float)
+
+define void @llround_i64f64(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @llround_i64f64(
+; CHECK: llvm.llround.v4i64.v4f64
+; CHECK: ret void
+;
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv
+ %0 = load double, ptr %arrayidx, align 8
+ %call = tail call i64 @llvm.llround.i64.f64(double %0)
+ %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv
+ store i64 %call, ptr %arrayidx2, align 8
+ %iv.next = add i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i64 @llvm.llround.i64.f64(double)
+
define void @fma_f32(i32 %n, ptr %y, ptr %x, ptr %z, ptr %w) {
; CHECK-LABEL: @fma_f32(
; CHECK: llvm.fma.v4f32
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That should be fine, assuming the cost model returns reasonable costs for them. Would be good to check if we have tests for their cost (possibly just as cost-model tests)
ab0360c to
bff4172
Compare
Thanks, I've added CostModel tests, although the scalable versions have invalid costs? |
Scalable-vector versions cannot be lowered, so the invalid costs are correct. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I gave this a try on AArch64 and it seems to enable vectorizing a few more loops, but in some cases it doesn't seem profitable, one example being https://llvm.godbolt.org/z/nb7667c71
That's even though the cost of ldexp on AArch64 is already quite high: https://llvm.godbolt.org/z/Wb1nj3Knq
cc @davemgreen in case he has additional thoughts
|
I would expect the costs to be something like 10 for a libcall, and 10*4+scalarization_overhead for the vector. They might need to be overridden on AArch64 as they are marked custom, but I don't see them in BasicTTIImplBase::getTypeBasedIntrinsicInstrCost. Can you add tests for SLP / scalarizer, as in https://reviews.llvm.org/D124358 or https://github.com/llvm/llvm-project/pull/71416/files? |
bff4172 to
618a972
Compare
Will investigate.
Curiously, SLPVectorizer doesn't seem to vectorize [l]lround/ldexp. |
Thanks, I think this was indeed the issue. ldexp cost has now been fixed. |
7a7c232 to
1140099
Compare
|
Gentle ping. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add an SLP vectorizer test for ldexp, particularly one where the calls use different types for the exponent?
|
Gentle ping. |
This reverts commit 1336675. This broke various LLVM testsuite buildbots for AArch64 SVE, but the problem got masked because relevant buildbots were already failing due to other breakage. It has broken llvm-test-suite test: gfortran-regression-compile-regression__vect__pr106253_f.test https://lab.llvm.org/buildbot/#/builders/4/builds/8164 https://lab.llvm.org/buildbot/#/builders/17/builds/9858 https://lab.llvm.org/buildbot/#/builders/41/builds/8067 https://lab.llvm.org/buildbot/#/builders/143/builds/9607
|
This broke various LLVM buildbots testing AArch64 SVE, but the problem was masked because relevant buildbots were already failing due to another breakage. |
|
Thanks, the problem was indeed masked. I'll look into the issue. |
|
They should all have an invalid cost if there is no lowering for them, preventing vectorization. https://godbolt.org/z/78hT1j98G |
|
I see. The hoisted vplan node is not included in the cost calculation, even though it is vectorized. |
@davemgreen Thanks for the investigation: this seems to be a bug in LV. Could you kindly share an LLVM IR reproducer? |
|
Does this fail for you? https://godbolt.org/z/17Ye168cK |
|
I tried running your example with |
|
Does it fail with just |
|
Ah yes, it does indeed! And it is in vector pre.header. Thanks a lot! I will look into this bug. |
No description provided.