Skip to content

Conversation

@artagnon
Copy link
Contributor

No description provided.

@llvmbot llvmbot added llvm:analysis Includes value tracking, cost tables and constant folding llvm:transforms labels Jun 24, 2025
@llvmbot
Copy link
Member

llvmbot commented Jun 24, 2025

@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-llvm-analysis

Author: Ramkumar Ramachandra (artagnon)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/145545.diff

2 Files Affected:

  • (modified) llvm/lib/Analysis/VectorUtils.cpp (+6)
  • (modified) llvm/test/Transforms/LoopVectorize/intrinsic.ll (+209)
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 63fccee63c0ae..a391e92e84fc6 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -81,6 +81,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::exp:
   case Intrinsic::exp10:
   case Intrinsic::exp2:
+  case Intrinsic::ldexp:
   case Intrinsic::log:
   case Intrinsic::log10:
   case Intrinsic::log2:
@@ -108,6 +109,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::canonicalize:
   case Intrinsic::fptosi_sat:
   case Intrinsic::fptoui_sat:
+  case Intrinsic::lround:
+  case Intrinsic::llround:
   case Intrinsic::lrint:
   case Intrinsic::llrint:
   case Intrinsic::ucmp:
@@ -189,6 +192,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   switch (ID) {
   case Intrinsic::fptosi_sat:
   case Intrinsic::fptoui_sat:
+  case Intrinsic::lround:
+  case Intrinsic::llround:
   case Intrinsic::lrint:
   case Intrinsic::llrint:
   case Intrinsic::vp_lrint:
@@ -203,6 +208,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   case Intrinsic::vp_is_fpclass:
     return OpdIdx == 0;
   case Intrinsic::powi:
+  case Intrinsic::ldexp:
     return OpdIdx == -1 || OpdIdx == 1;
   default:
     return OpdIdx == -1;
diff --git a/llvm/test/Transforms/LoopVectorize/intrinsic.ll b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
index 9c910d70807a1..32c702fd94c67 100644
--- a/llvm/test/Transforms/LoopVectorize/intrinsic.ll
+++ b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
@@ -324,6 +324,58 @@ for.end:                                          ; preds = %for.body, %entry
 
 declare double @llvm.exp2.f64(double)
 
+define void @ldexp_f32i32(i32 %n, ptr %y, ptr %x, i32 %exp) {
+; CHECK-LABEL: @ldexp_f32i32(
+; CHECK: llvm.ldexp.v4f32.v4i32
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call float @llvm.ldexp.f32.i32(float %0, i32 %exp)
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i32 %iv
+  store float %call, ptr %arrayidx2, align 4
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.ldexp.f32.i32(float, i32)
+
+define void @ldexp_f64i32(i32 %n, ptr %y, ptr %x, i32 %exp) {
+; CHECK-LABEL: @ldexp_f64i32(
+; CHECK: llvm.ldexp.v4f64.v4i32
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv
+  %0 = load double, ptr %arrayidx, align 8
+  %call = tail call double @llvm.ldexp.f64.i32(double %0, i32 %exp)
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i32 %iv
+  store double %call, ptr %arrayidx2, align 8
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.ldexp.f64.i32(double, i32)
+
 define void @log_f32(i32 %n, ptr %y, ptr %x) {
 ; CHECK-LABEL: @log_f32(
 ; CHECK: llvm.log.v4f32
@@ -976,6 +1028,163 @@ for.end:                                          ; preds = %for.body, %entry
 
 declare double @llvm.roundeven.f64(double)
 
+
+define void @lround_i32f32(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @lround_i32f32(
+; CHECK: llvm.lround.v4i32.v4f32
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call i32 @llvm.lround.i32.f32(float %0)
+  %arrayidx2 = getelementptr inbounds i32, ptr %x, i32 %iv
+  store i32 %call, ptr %arrayidx2, align 4
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i32 @llvm.lround.i32.f32(float)
+
+define void @lround_i32f64(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @lround_i32f64(
+; CHECK: llvm.lround.v4i32.v4f64
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv
+  %0 = load double, ptr %arrayidx, align 8
+  %call = tail call i32 @llvm.lround.i32.f64(double %0)
+  %arrayidx2 = getelementptr inbounds i32, ptr %x, i32 %iv
+  store i32 %call, ptr %arrayidx2, align 8
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i32 @llvm.lround.i32.f64(double)
+
+define void @lround_i64f32(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @lround_i64f32(
+; CHECK: llvm.lround.v4i64.v4f32
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call i64 @llvm.lround.i64.f32(float %0)
+  %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv
+  store i64 %call, ptr %arrayidx2, align 4
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i64 @llvm.lround.i64.f32(float)
+
+define void @lround_i64f64(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @lround_i64f64(
+; CHECK: llvm.lround.v4i64.v4f64
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv
+  %0 = load double, ptr %arrayidx, align 8
+  %call = tail call i64 @llvm.lround.i64.f64(double %0)
+  %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv
+  store i64 %call, ptr %arrayidx2, align 8
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i64 @llvm.lround.i64.f64(double)
+
+define void @llround_i64f32(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @llround_i64f32(
+; CHECK: llvm.llround.v4i64.v4f32
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %y, i32 %iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call i64 @llvm.llround.i64.f32(float %0)
+  %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv
+  store i64 %call, ptr %arrayidx2, align 4
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i64 @llvm.llround.i64.f32(float)
+
+define void @llround_i64f64(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @llround_i64f64(
+; CHECK: llvm.llround.v4i64.v4f64
+; CHECK: ret void
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double, ptr %y, i32 %iv
+  %0 = load double, ptr %arrayidx, align 8
+  %call = tail call i64 @llvm.llround.i64.f64(double %0)
+  %arrayidx2 = getelementptr inbounds i64, ptr %x, i32 %iv
+  store i64 %call, ptr %arrayidx2, align 8
+  %iv.next = add i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare i64 @llvm.llround.i64.f64(double)
+
 define void @fma_f32(i32 %n, ptr %y, ptr %x, ptr %z, ptr %w) {
 ; CHECK-LABEL: @fma_f32(
 ; CHECK: llvm.fma.v4f32

Copy link
Contributor

@fhahn fhahn left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That should be fine, assuming the cost model returns reasonable costs for them. Would be good to check if we have tests for their cost (possibly just as cost-model tests)

@artagnon artagnon force-pushed the triviallyvec-missing branch from ab0360c to bff4172 Compare June 25, 2025 11:25
@artagnon
Copy link
Contributor Author

That should be fine, assuming the cost model returns reasonable costs for them. Would be good to check if we have tests for their cost (possibly just as cost-model tests)

Thanks, I've added CostModel tests, although the scalable versions have invalid costs?

@artagnon
Copy link
Contributor Author

artagnon commented Jun 25, 2025

That should be fine, assuming the cost model returns reasonable costs for them. Would be good to check if we have tests for their cost (possibly just as cost-model tests)

Thanks, I've added CostModel tests, although the scalable versions have invalid costs?

Scalable-vector versions cannot be lowered, so the invalid costs are correct.

Copy link
Contributor

@fhahn fhahn left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I gave this a try on AArch64 and it seems to enable vectorizing a few more loops, but in some cases it doesn't seem profitable, one example being https://llvm.godbolt.org/z/nb7667c71

That's even though the cost of ldexp on AArch64 is already quite high: https://llvm.godbolt.org/z/Wb1nj3Knq

cc @davemgreen in case he has additional thoughts

@fhahn fhahn requested a review from davemgreen June 27, 2025 14:36
@davemgreen
Copy link
Collaborator

I would expect the costs to be something like 10 for a libcall, and 10*4+scalarization_overhead for the vector. They might need to be overriden on AArch64 as they are marked custom, but I don't see them in BasicTTIImplBase::getTypeBasedIntrinsicInstrCost.

Can you add tests for SLP / scalarizer, as in https://reviews.llvm.org/D124358 or https://github.com/llvm/llvm-project/pull/71416/files?

@artagnon artagnon force-pushed the triviallyvec-missing branch from bff4172 to 618a972 Compare June 29, 2025 13:37
@artagnon
Copy link
Contributor Author

I would expect the costs to be something like 10 for a libcall, and 10*4+scalarization_overhead for the vector. They might need to be overriden on AArch64 as they are marked custom, but I don't see them in BasicTTIImplBase::getTypeBasedIntrinsicInstrCost.

Will investigate.

Can you add tests for SLP / scalarizer, as in https://reviews.llvm.org/D124358 or https://github.com/llvm/llvm-project/pull/71416/files?

Curiously, SLPVectorizer doesn't seem to vectorize [l]lround/ldexp.

@artagnon
Copy link
Contributor Author

I would expect the costs to be something like 10 for a libcall, and 10*4+scalarization_overhead for the vector. They might need to be overriden on AArch64 as they are marked custom, but I don't see them in BasicTTIImplBase::getTypeBasedIntrinsicInstrCost.

Will investigate.

Thanks, I think this was indeed the issue. ldexp cost has now been fixed.

@artagnon artagnon force-pushed the triviallyvec-missing branch from 7a7c232 to 1140099 Compare July 7, 2025 08:27
@artagnon
Copy link
Contributor Author

artagnon commented Jul 7, 2025

Gentle ping.

Copy link
Contributor

@arsenm arsenm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add an SLP vectorizer test for ldexp, particularly one where they use different types for the exponent

@artagnon
Copy link
Contributor Author

Gentle ping.

@artagnon artagnon merged commit 1336675 into llvm:main Jul 29, 2025
9 checks passed
@artagnon artagnon deleted the triviallyvec-missing branch July 29, 2025 18:23
omjavaid added a commit that referenced this pull request Jul 31, 2025
This reverts commit 1336675.

This broke various LLVM testsuite buildbots for AArch64 SVE, but the
problem got masked because relevant buildbots were already failing
due to other breakage.

It has broken llvm-test-suite test:
gfortran-regression-compile-regression__vect__pr106253_f.test

https://lab.llvm.org/buildbot/#/builders/4/builds/8164
https://lab.llvm.org/buildbot/#/builders/17/builds/9858
https://lab.llvm.org/buildbot/#/builders/41/builds/8067
https://lab.llvm.org/buildbot/#/builders/143/builds/9607
@omjavaid
Copy link
Contributor

This broke various LLVM buildbots testing AArch64 SVE, but the problem was masked because relevant buildbots were already failing due to another breakage.
I am going to revert this as it has been a while since the bots were red. Kindly look into the failure and push it again after the fix.
https://lab.llvm.org/buildbot/#/builders/4/builds/8164
https://lab.llvm.org/buildbot/#/builders/17/builds/9858
https://lab.llvm.org/buildbot/#/builders/41/builds/8067
https://lab.llvm.org/buildbot/#/builders/143/builds/9607

@artagnon
Copy link
Contributor Author

Thanks, the problem was indeed masked. I'll look into the issue.

@davemgreen
Copy link
Collaborator

They should all have an invalid cost if there is no lowering for them, preventing vectorization. https://godbolt.org/z/78hT1j98G
Maybe one of them isn't getting an invalid cost for some reason?

@davemgreen
Copy link
Collaborator

I see. The hoisted vplan node is not included in the cost calculation, even though it is vectorized.

@artagnon
Copy link
Contributor Author

artagnon commented Aug 1, 2025

I see. The hoisted vplan node is not included in the cost calculation, even though it is vectorized.

@davemgreen Thanks for the investigation: this seems to be a bug in LV. Could you kindly share a LLVM IR reproducer?

@davemgreen
Copy link
Collaborator

Does this fail for you? https://godbolt.org/z/17Ye168cK
The original is something like flang llvm-test-suite/Fortran/gfortran/regression/vect/pr106253.f -O3 -march=armv9-a

@artagnon
Copy link
Contributor Author

artagnon commented Aug 1, 2025

I tried running your example with opt -passes='default<O3>' -mtriple=aarch64 -mattr=+sve, but I get a scalar tail call i32 @llvm.lround.i32.f32(float 0.000000e+00) hoisted. My machine is not powerful enough to build flang unfortunately, and that's why I'm asking for help.

@davemgreen
Copy link
Collaborator

davemgreen commented Aug 1, 2025

Does it fail with just -passes=loop-vectorize? As in does it generate a %14 = call <vscale x 4 x i32> @llvm.lround.nxv4i32.nxv4f32(<vscale x 4 x float> zeroinitializer)?

@artagnon
Copy link
Contributor Author

artagnon commented Aug 1, 2025

Ah yes, it does indeed! And it is in vector pre.header. Thanks a lot! I will look into this bug.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

backend:AMDGPU llvm:analysis Includes value tracking, cost tables and constant folding llvm:transforms

Projects

None yet

Development

Successfully merging this pull request may close these issues.

7 participants