
Conversation

topperc (Collaborator) commented Aug 19, 2025

Use a masked load with only the first element possibly active. Return 1 for the number of elements processed, since the EVL gets converted to a mask first.

llvmbot (Member) commented Aug 19, 2025

@llvm/pr-subscribers-llvm-transforms

Author: Craig Topper (topperc)

Changes

Use a masked load with only the first element possibly active. Return 1 for the number of elements processed, since the EVL gets converted to a mask first.


Full diff: https://github.com/llvm/llvm-project/pull/154440.diff

2 Files Affected:

  • (modified) llvm/lib/CodeGen/ExpandVectorPredication.cpp (+16)
  • (modified) llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll (+118)
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index 753c656007703..0efff743355b4 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -465,6 +465,21 @@ CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
         VPI.getName());
     break;
   }
+  case Intrinsic::vp_load_ff: {
+    // Mask out all but the first lane.
+    Value *NewMask = ConstantInt::getFalse(MaskParam->getType());
+    NewMask = Builder.CreateInsertElement(
+        NewMask, ConstantInt::getTrue(MaskParam->getType()->getScalarType()),
+        (uint64_t)0);
+    NewMask = Builder.CreateAnd(MaskParam, NewMask);
+    Value *MaskedLoad = Builder.CreateMaskedLoad(
+        VPI.getType()->subtypes()[0], PtrParam, AlignOpt.valueOrOne(), NewMask);
+    Value *EVLResult = Builder.getInt32(1);
+    Value *InsertValue = Builder.CreateInsertValue(
+        PoisonValue::get(VPI.getType()), MaskedLoad, 0);
+    NewMemoryInst = Builder.CreateInsertValue(InsertValue, EVLResult, 1);
+    break;
+  }
   }
 
   assert(NewMemoryInst);
@@ -609,6 +624,7 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
   case Intrinsic::vp_store:
   case Intrinsic::vp_gather:
   case Intrinsic::vp_scatter:
+  case Intrinsic::vp_load_ff:
     return expandPredicationInMemoryIntrinsic(Builder, VPI);
   }
 
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll
index 81923642811d9..721dd1bd1acc1 100644
--- a/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll
@@ -91,6 +91,61 @@ define void @vpstore_v2i64_allones_mask_vlmax(<2 x i64> %val, ptr %ptr) {
   ret void
 }
 
+define { <2 x i64>, i32 } @vpload_ff_v2i64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_ff_v2i64(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i1> [[TMP2]], <i1 true, i1 false>
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP3]], <2 x i64> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <2 x i64>, i32 } poison, <2 x i64> [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { <2 x i64>, i32 } [[TMP5]], i32 1, 1
+; CHECK-NEXT:    ret { <2 x i64>, i32 } [[TMP6]]
+;
+  %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 %evl)
+  ret { <2 x i64>, i32 } %load
+}
+
+define { <2 x i64>, i32 } @vpload_ff_v2i64_vlmax(ptr %ptr, <2 x i1> %m) {
+; CHECK-LABEL: @vpload_ff_v2i64_vlmax(
+; CHECK-NEXT:    [[TMP1:%.*]] = and <2 x i1> [[M:%.*]], <i1 true, i1 false>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP1]], <2 x i64> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { <2 x i64>, i32 } poison, <2 x i64> [[TMP2]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue { <2 x i64>, i32 } [[TMP3]], i32 1, 1
+; CHECK-NEXT:    ret { <2 x i64>, i32 } [[TMP4]]
+;
+  %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 2)
+  ret { <2 x i64>, i32 } %load
+}
+
+define { <2 x i64>, i32 } @vpload_ff_v2i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_ff_v2i64_allones_mask(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i1> [[TMP2]], <i1 true, i1 false>
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP3]], <2 x i64> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <2 x i64>, i32 } poison, <2 x i64> [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { <2 x i64>, i32 } [[TMP5]], i32 1, 1
+; CHECK-NEXT:    ret { <2 x i64>, i32 } [[TMP6]]
+;
+  %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> <i1 1, i1 1>, i32 %evl)
+  ret { <2 x i64>, i32 } %load
+}
+
+define { <2 x i64>, i32 } @vpload_ff_v2i64_allones_mask_vlmax(ptr %ptr) {
+; CHECK-LABEL: @vpload_ff_v2i64_allones_mask_vlmax(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> <i1 true, i1 false>, <2 x i64> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <2 x i64>, i32 } poison, <2 x i64> [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { <2 x i64>, i32 } [[TMP2]], i32 1, 1
+; CHECK-NEXT:    ret { <2 x i64>, i32 } [[TMP3]]
+;
+  %load = call { <2 x i64>, i32 } @llvm.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> <i1 1, i1 1>, i32 2)
+  ret { <2 x i64>, i32 } %load
+}
+
 ; Scalable vectors
 define <vscale x 1 x i64> @vpload_nxv1i64(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: @vpload_nxv1i64(
@@ -196,6 +251,69 @@ define void @vpstore_nxv1i64_allones_mask_vscale(<vscale x 1 x i64> %val, ptr %p
   ret void
 }
 
+define { <vscale x 1 x i64>, i32 } @vpload_ff_nxv1i64(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_ff_nxv1i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = and <vscale x 1 x i1> [[TMP2]], insertelement (<vscale x 1 x i1> zeroinitializer, i1 true, i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i64> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } poison, <vscale x 1 x i64> [[TMP4]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } [[TMP5]], i32 1, 1
+; CHECK-NEXT:    ret { <vscale x 1 x i64>, i32 } [[TMP3]]
+;
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vpload_ff_nxv1i64_vscale(ptr %ptr, <vscale x 1 x i1> %m) {
+; CHECK-LABEL: @vpload_ff_nxv1i64_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = and <vscale x 1 x i1> [[M:%.*]], insertelement (<vscale x 1 x i1> zeroinitializer, i1 true, i64 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP4]], <vscale x 1 x i64> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } poison, <vscale x 1 x i64> [[TMP2]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } [[TMP3]], i32 1, 1
+; CHECK-NEXT:    ret { <vscale x 1 x i64>, i32 } [[TMP1]]
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %vlmax)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vpload_ff_nxv1i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_ff_nxv1i64_allones_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], splat (i1 true)
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = and <vscale x 1 x i1> [[TMP2]], insertelement (<vscale x 1 x i1> zeroinitializer, i1 true, i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP6]], <vscale x 1 x i64> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } poison, <vscale x 1 x i64> [[TMP4]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } [[TMP5]], i32 1, 1
+; CHECK-NEXT:    ret { <vscale x 1 x i64>, i32 } [[TMP3]]
+;
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vpload_ff_nxv1i64_allones_mask_vscale(ptr %ptr) {
+; CHECK-LABEL: @vpload_ff_nxv1i64_allones_mask_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> insertelement (<vscale x 1 x i1> zeroinitializer, i1 true, i64 0), <vscale x 1 x i64> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } poison, <vscale x 1 x i64> [[TMP3]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { <vscale x 1 x i64>, i32 } [[TMP2]], i32 1, 1
+; CHECK-NEXT:    ret { <vscale x 1 x i64>, i32 } [[TMP1]]
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %vlmax)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
 declare i32 @llvm.vscale.i32()
 
 declare <2 x i64> @llvm.vp.load.v2i64.p0(ptr, <2 x i1>, i32)

topperc requested a review from fhahn August 20, 2025 16:08
    NewMask = Builder.CreateAnd(MaskParam, NewMask);
    Value *MaskedLoad = Builder.CreateMaskedLoad(
        VPI.getType()->subtypes()[0], PtrParam, AlignOpt.valueOrOne(), NewMask);
    Value *EVLResult = Builder.getInt32(1);
Contributor

Should the returned scalar be umin(EVL, 1) in case EVL is 0? The LangRef seems to suggest that it should be but isn't strict about it:

The second result is usually less than evl when an exception would occur for reading that lane
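
A minimal sketch of that clamp, assuming a Value *EVLParam holding the original EVL were still in scope at this point in expandPredicationInMemoryIntrinsic (names other than Builder are assumptions, not the patch's code):

  // Hedged sketch: clamp the reported element count so it never exceeds the
  // incoming EVL (and becomes 0 when EVL is 0).
  Value *EVLResult = Builder.CreateBinaryIntrinsic(
      Intrinsic::umin, EVLParam, Builder.getInt32(1));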

Collaborator Author

The original EVL has been moved to the mask by this point. The EVL this code sees will be VLMax, so it's always at least 1.
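
For reference, a rough paraphrase (not the verbatim source) of what that fold does for a fixed-width vector before this expansion runs; it matches the splat/icmp/and sequence in the CHECK lines above:

  // Paraphrased sketch of the EVL-to-mask fold (fixed-width case): splat the
  // EVL, compare a step vector against it, and AND the result into the mask.
  unsigned NumElems =
      cast<FixedVectorType>(MaskParam->getType())->getNumElements();
  SmallVector<Constant *, 8> Indices;
  for (unsigned I = 0; I < NumElems; ++I)
    Indices.push_back(Builder.getInt32(I));
  Value *VLSplat = Builder.CreateVectorSplat(NumElems, EVLParam);
  Value *EVLMask = Builder.CreateICmpULT(ConstantVector::get(Indices), VLSplat);
  Value *FoldedMask = Builder.CreateAnd(EVLMask, MaskParam);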

Contributor

Maybe we should instead use (zext (extractelt newmask, 0) to i32)?

Sorry for being pedantic; hopefully nothing ever ends up needing to expand vp.load.ff. But it seems like we should avoid returning a scalar that's greater than the original EVL.
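
In the expansion that might look roughly like this (sketch only; NewMask is the lane-0-only mask the patch already builds):

  // Hedged sketch: report 1 only when lane 0 of the final mask is actually
  // enabled, and 0 otherwise.
  Value *Lane0 = Builder.CreateExtractElement(NewMask, (uint64_t)0);
  Value *EVLResult = Builder.CreateZExt(Lane0, Builder.getInt32Ty());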

Collaborator Author

I think that would introduce the forward progress problem Philip mentioned. Maybe we need to do this expansion before the EVL rewrite?

Contributor

Yeah. Or we could keep the EVL as legal for vp.load.ff in sanitizeStrategy when the op strategy is convert. And change the op strategy to convert if the EVL isn't legal.

That way we have access to the original EVL in expandPredicationInMemoryIntrinsic.
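
Sketched, with the field and enum names assumed to be those of TargetTransformInfo::VPLegalization (not verified against the current tree):

  // Hedged sketch for sanitizeStrategy: keep the EVL parameter legal for
  // vp.load.ff when the op itself is being converted, so the expansion still
  // sees the original EVL.
  if (VPI.getIntrinsicID() == Intrinsic::vp_load_ff &&
      Strategy.OpStrategy == VPLegalization::Convert)
    Strategy.EVLParamStrategy = VPLegalization::Legal;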

preames (Collaborator) left a comment

The current LangRef text allows the EVL parameter to be zero. As a result, this lowering is unsound - we need to conditionally zero the first mask element based on whether the EVL is zero. We also need to return zero for the resulting VL when EVL is zero.
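
For concreteness, a sketch of that conditional handling, again assuming the original EVL were visible to the expansion as EVLParam (it currently is not, since the EVL has already been folded into the mask):

  // Hedged sketch: only enable lane 0 when EVL != 0, and report 0 elements
  // processed when EVL == 0.
  Value *EVLNonZero = Builder.CreateICmpNE(EVLParam, Builder.getInt32(0));
  Value *Lane0 = Builder.CreateAnd(
      Builder.CreateExtractElement(MaskParam, (uint64_t)0), EVLNonZero);
  Value *NewMask = Builder.CreateInsertElement(
      Constant::getNullValue(MaskParam->getType()), Lane0, (uint64_t)0);
  Value *EVLResult = Builder.CreateZExt(EVLNonZero, Builder.getInt32Ty());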

Do we actually need to support the EVL=0 case in LangRef? Or should we simply define that to be poison? (Edit: better would be immediate undefined behavior)

Note that the RISC-V instruction does not allow a zero EVL for this case. The text reads: "These instructions execute as a regular load except that they will only take a trap caused by a synchronous exception on element 0. If element 0 raises an exception, vl is not modified, and the trap is taken. If an element > 0 raises an exception, the corresponding trap is not taken, and the vector length vl is reduced to the index of the element that would have raised an exception."

lukel97 (Contributor) commented Aug 21, 2025

The current LangRef text allows the EVL parameter to be zero. As a result, this lowering is unsound - we need to conditionally zero the first mask element based on whether the EVL is zero.

The EVL is folded into the mask in foldEVLIntoMask, so if the EVL is zero then the mask in the expansion will also be all zeros.

preames (Collaborator) commented Aug 21, 2025

The EVL is folded into the mask in foldEVLIntoMask, so if the EVL is zero then the mask in the expansion will also be all zeros.

Is that guaranteed? If so, then yeah, that's sound modulo your point about returning 1 in this case. Hm, don't we have a forward progress problem here though? If EVL is non-zero but the 0-th lane of the mask is zero, don't we need to return at least one for forward progress in an idiomatic loop?
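
To illustrate the forward-progress concern with an idiomatic consumer loop (hypothetical helper and variable names, purely for illustration):

  // If the reported count can be 0 while unprocessed elements remain, the
  // induction variable never advances and this loop spins forever.
  size_t I = 0;
  while (I < N) {
    // Hypothetical wrapper around the ff-load expansion.
    size_t Processed = FaultOnlyFirstLoadAndProcess(&Src[I], N - I);
    I += Processed; // Processed == 0 with I < N => no forward progress
  }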

fhahn (Contributor) left a comment

Thanks for following up on this!
