Skip to content

Conversation

@artagnon
Copy link
Contributor

No description provided.

@llvmbot llvmbot added llvm:analysis Includes value tracking, cost tables and constant folding llvm:transforms labels Jun 24, 2025
@llvmbot
Copy link
Member

llvmbot commented Jun 24, 2025

@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-llvm-analysis

Author: Ramkumar Ramachandra (artagnon)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/145545.diff

2 Files Affected:

  • (modified) llvm/lib/Analysis/VectorUtils.cpp (+6)
  • (modified) llvm/test/Transforms/LoopVectorize/intrinsic.ll (+209)
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 63fccee63c0ae..a391e92e84fc6 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::exp:
   case Intrinsic::exp10:
   case Intrinsic::exp2:
+  case Intrinsic::ldexp:
   case Intrinsic::log:
   case Intrinsic::log10:
   case Intrinsic::log2:
@@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::canonicalize:
   case Intrinsic::fptosi_sat:
   case Intrinsic::fptoui_sat:
+  case Intrinsic::lround:
+  case Intrinsic::llround:
   case Intrinsic::lrint:
   case Intrinsic::llrint:
   case Intrinsic::ucmp:
@@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   switch (ID) {
   case Intrinsic::fptosi_sat:
   case Intrinsic::fptoui_sat:
+  case Intrinsic::lround:
+  case Intrinsic::llround:
   case Intrinsic::lrint:
   case Intrinsic::llrint:
   case Intrinsic::vp_lrint:
@@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   case Intrinsic::vp_is_fpclass:
     return OpdIdx == 0;
   case Intrinsic::powi:
+  case Intrinsic::ldexp:
     return OpdIdx == -1 || OpdIdx == 1;
   default:
     return OpdIdx == -1;
diff --git a/llvm/test/Transforms/LoopVectorize/intrinsic.ll b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
index 9c910d70807a1..32c702fd94c67 100644
--- a/llvm/test/Transforms/LoopVectorize/intrinsic.ll
+++ b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
@@ -324,6 +324,58 @@ for.end:                                          ; preds = %for.body, %entry
 
 declare double @llvm.exp2.f64(double)
 
+define void @ldexp_f32i32(i32 %n, ptr %y, ptr %x, i32 %exp) {
+; CHECK-LABEL: @ldexp_f32i32(
+; CHECK: llvm.ldexp.v4f32.v4i32
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call float @llvm.ldexp.f32.i32(float %0, i32 %exp)
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i32 %iv
+  store float %call, ptr %arrayidx2, align 4
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.ldexp.f32.i32(float, i32)
+
+define void @ldexp_f64i32(i32 %n, ptr %y, ptr %x, i32 %exp) {
+; CHECK-LABEL: @ldexp_f64i32(
+; CHECK: llvm.ldexp.v4f64.v4i32
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv
+  %0 = load double, ptr %arrayidx, align 8
+  %call = tail call double @llvm.ldexp.f64.i32(double %0, i32 %exp)
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i32 %iv
+  store double %call, ptr %arrayidx2, align 8
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.ldexp.f64.i32(double, i32)
+
 define void @log_f32(i32 %n, ptr %y, ptr %x) {
 ; CHECK-LABEL: @log_f32(
 ; CHECK: llvm.log.v4f32
@@ -976,6 +1028,163 @@ for.end:                                          ; preds = %for.body, %entry
 
 declare double @llvm.roundeven.f64(double)
 
+
+define void @lround_i32f32(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @lround_i32f32(
+; CHECK: llvm.lround.v4i32.v4f32
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call i32 @llvm.lround.i32.f32(float %0)
+  %arrayidx2 = getelementptr inbounds i32, ptr %x, i32 %iv
+  store i32 %call, ptr %arrayidx2, align 4
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i32 @llvm.lround.i32.f32(float)
+
+define void @lround_i32f64(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @lround_i32f64(
+; CHECK: llvm.lround.v4i32.v4f64
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv
+  %0 = load double, ptr %arrayidx, align 8
+  %call = tail call i32 @llvm.lround.i32.f64(double %0)
+  %arrayidx2 = getelementptr inbounds i32, ptr %x, i32 %iv
+  store i32 %call, ptr %arrayidx2, align 8
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i32 @llvm.lround.i32.f64(double)
+
+define void @lround_i64f32(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @lround_i64f32(
+; CHECK: llvm.lround.v4i64.v4f32
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call i64 @llvm.lround.i64.f32(float %0)
+  %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv
+  store i64 %call, ptr %arrayidx2, align 4
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i64 @llvm.lround.i64.f32(float)
+
+define void @lround_i64f64(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @lround_i64f64(
+; CHECK: llvm.lround.v4i64.v4f64
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv
+  %0 = load double, ptr %arrayidx, align 8
+  %call = tail call i64 @llvm.lround.i64.f64(double %0)
+  %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv
+  store i64 %call, ptr %arrayidx2, align 8
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i64 @llvm.lround.i64.f64(double)
+
+define void @llround_i64f32(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @llround_i64f32(
+; CHECK: llvm.llround.v4i64.v4f32
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call i64 @llvm.llround.i64.f32(float %0)
+  %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv
+  store i64 %call, ptr %arrayidx2, align 4
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i64 @llvm.llround.i64.f32(float)
+
+define void @llround_i64f64(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @llround_i64f64(
+; CHECK: llvm.llround.v4i64.v4f64
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv
+  %0 = load double, ptr %arrayidx, align 8
+  %call = tail call i64 @llvm.llround.i64.f64(double %0)
+  %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv
+  store i64 %call, ptr %arrayidx2, align 8
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i64 @llvm.llround.i64.f64(double)
+
 define void @fma_f32(i32 %n, ptr %y, ptr %x, ptr %z, ptr %w) {
 ; CHECK-LABEL: @fma_f32(
 ; CHECK: llvm.fma.v4f32

Copy link
Contributor

@fhahn fhahn left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That should be fine, assuming the cost model returns reasonable costs for them. Would be good to check if we have tests for their cost (possibly just as cost-model tests)

@artagnon artagnon force-pushed the triviallyvec-missing branch from ab0360c to bff4172 Compare June 25, 2025 11:25
@artagnon
Copy link
Contributor Author

That should be fine, assuming the cost model returns reasonable costs for them. Would be good to check if we have tests for their cost (possibly just as cost-model tests)

Thanks, I've added CostModel tests, although the scalable versions have invalid costs?

@artagnon
Copy link
Contributor Author

artagnon commented Jun 25, 2025

That should be fine, assuming the cost model returns reasonable costs for them. Would be good to check if we have tests for their cost (possibly just as cost-model tests)

Thanks, I've added CostModel tests, although the scalable versions have invalid costs?

Scalable-vector versions cannot be lowered, so the invalid costs are correct.

Copy link
Contributor

@fhahn fhahn left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I gave this a try on AArch64 and it seems to enable vectorizing a few more loops, but in some cases it doesn't seem profitable, one example being https://llvm.godbolt.org/z/nb7667c71

That's even though the cost of ldexp on AArch64 is already quite high: https://llvm.godbolt.org/z/Wb1nj3Knq

cc @davemgreen in case he has additional thoughts

@fhahn fhahn requested a review from davemgreen June 27, 2025 14:36
@davemgreen
Copy link
Collaborator

I would expect the costs to be something like 10 for a libcall, and 10*4+scalarization_overhead for the vector. They might need to be overriden on AArch64 as they are marked custom, but I don't see them in BasicTTIImplBase::getTypeBasedIntrinsicInstrCost.

Can you add tests for SLP / scalarizer, as in https://reviews.llvm.org/D124358 or https://github.com/llvm/llvm-project/pull/71416/files?

@artagnon artagnon force-pushed the triviallyvec-missing branch from bff4172 to 618a972 Compare June 29, 2025 13:37
@artagnon
Copy link
Contributor Author

I would expect the costs to be something like 10 for a libcall, and 10*4+scalarization_overhead for the vector. They might need to be overriden on AArch64 as they are marked custom, but I don't see them in BasicTTIImplBase::getTypeBasedIntrinsicInstrCost.

Will investigate.

Can you add tests for SLP / scalarizer, as in https://reviews.llvm.org/D124358 or https://github.com/llvm/llvm-project/pull/71416/files?

Curiously, SLPVectorizer doesn't seem to vectorize [l]lround/ldexp.

@artagnon
Copy link
Contributor Author

I would expect the costs to be something like 10 for a libcall, and 10*4+scalarization_overhead for the vector. They might need to be overriden on AArch64 as they are marked custom, but I don't see them in BasicTTIImplBase::getTypeBasedIntrinsicInstrCost.

Will investigate.

Thanks, I think this was indeed the issue. ldexp cost has now been fixed.

@artagnon artagnon force-pushed the triviallyvec-missing branch from 7a7c232 to 1140099 Compare July 7, 2025 08:27
@artagnon
Copy link
Contributor Author

artagnon commented Jul 7, 2025

Gentle ping.

Copy link
Contributor

@arsenm arsenm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add an SLP vectorizer test for ldexp, particularly one where they use different types for the exponent

@artagnon
Copy link
Contributor Author

Gentle ping.

@artagnon artagnon merged commit 1336675 into llvm:main Jul 29, 2025
9 checks passed
@artagnon artagnon deleted the triviallyvec-missing branch July 29, 2025 18:23
omjavaid added a commit that referenced this pull request Jul 31, 2025
This reverts commit 1336675.

This broke various LLVM testsuite buildbots for AArch64 SVE, but the
problem got masked because relevant buildbots were already failing
due to other breakage.

It has broken llvm-test-suite test:
gfortran-regression-compile-regression__vect__pr106253_f.test

https://lab.llvm.org/buildbot/#/builders/4/builds/8164
https://lab.llvm.org/buildbot/#/builders/17/builds/9858
https://lab.llvm.org/buildbot/#/builders/41/builds/8067
https://lab.llvm.org/buildbot/#/builders/143/builds/9607
@omjavaid
Copy link
Contributor

This broke various LLVM buildbots testing AArch64 SVE, but the problem was masked because relevant buildbots were already failing due to another breakage.
I am going to revert this as it has been a while since the bots were red. Kindly look into the failure and push it again after the fix.
https://lab.llvm.org/buildbot/#/builders/4/builds/8164
https://lab.llvm.org/buildbot/#/builders/17/builds/9858
https://lab.llvm.org/buildbot/#/builders/41/builds/8067
https://lab.llvm.org/buildbot/#/builders/143/builds/9607

@artagnon
Copy link
Contributor Author

Thanks, the problem was indeed masked. I'll look into the issue.

@davemgreen
Copy link
Collaborator

They should all have an invalid cost if there is no lowering for them, preventing vectorization. https://godbolt.org/z/78hT1j98G
Maybe one of them isn't getting an invalid cost for some reason?

@davemgreen
Copy link
Collaborator

I see. The hoisted vplan node is not included in the cost calculation, even though it is vectorized.

@artagnon
Copy link
Contributor Author

artagnon commented Aug 1, 2025

I see. The hoisted vplan node is not included in the cost calculation, even though it is vectorized.

@davemgreen Thanks for the investigation: this seems to be a bug in LV. Could you kindly share a LLVM IR reproducer?

@davemgreen
Copy link
Collaborator

Does this fail for you? https://godbolt.org/z/17Ye168cK
The original is something like flang llvm-test-suite/Fortran/gfortran/regression/vect/pr106253.f -O3 -march=armv9-a

@artagnon
Copy link
Contributor Author

artagnon commented Aug 1, 2025

I tried running your example with opt -passes='default<O3>' -mtriple=aarch64 -mattr=+sve, but I get a scalar tail call i32 @llvm.lround.i32.f32(float 0.000000e+00) hoisted. My machine is not powerful enough to build flang unfortunately, and that's why I'm asking for help.

@davemgreen
Copy link
Collaborator

davemgreen commented Aug 1, 2025

Does it fail with just -passes=loop-vectorize? As in does it generate a %14 = call <vscale x 4 x i32> @llvm.lround.nxv4i32.nxv4f32(<vscale x 4 x float> zeroinitializer)?

@artagnon
Copy link
Contributor Author

artagnon commented Aug 1, 2025

Ah yes, it does indeed! And it is in vector pre.header. Thanks a lot! I will look into this bug.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

backend:AMDGPU llvm:analysis Includes value tracking, cost tables and constant folding llvm:transforms

Projects

None yet

Development

Successfully merging this pull request may close these issues.

7 participants