[GVN] Teach GVN simple masked load/store forwarding #157689
```diff
@@ -2287,6 +2287,36 @@ bool GVNPass::processLoad(LoadInst *L) {
   return true;
 }
 
+// Attempt to process masked loads which have loaded from
+// masked stores with the same mask
+bool GVNPass::processMaskedLoad(IntrinsicInst *I) {
+  if (!MD)
+    return false;
+  MemDepResult Dep = MD->getDependency(I);
+  Instruction *DepInst = Dep.getInst();
+  if (!DepInst || !Dep.isLocal() || !Dep.isDef())
+    return false;
+
+  Value *Mask = I->getOperand(2);
+  Value *Passthrough = I->getOperand(3);
+  Value *StoreVal;
+  if (!match(DepInst, m_MaskedStore(m_Value(StoreVal), m_Value(), m_Value(),
+                                    m_Specific(Mask))))
+    return false;
```
|
Contributor: If a dependency is local then it's either a Clobber or a Def, and it looks like the code below is assuming a Def. It would be good to clarify this. If so, I think you have to bail out for the clobber case, as that's more complicated; see AnalyzeLoadAvailability.

Author: I added a bail-out if it's not a Def.
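For illustration, a minimal sketch (hypothetical IR, made-up names) of a case where the masked load's local dependency would come back as a Clobber rather than a Def, so the new code must bail out:

```llvm
; The intervening plain store may alias %q, so memory dependence reports a
; Clobber instead of a Def on the masked store, and forwarding is unsafe.
define <4 x float> @clobber_case(ptr %q, ptr %other, <4 x i1> %mask, <4 x float> %v, <4 x float> %pt) {
  call void @llvm.masked.store.v4f32.p0(<4 x float> %v, ptr %q, i32 1, <4 x i1> %mask)
  store float 0.0, ptr %other
  %l = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %q, i32 1, <4 x i1> %mask, <4 x float> %pt)
  ret <4 x float> %l
}
```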
```diff
+
+  // Remove the load but generate a select for the passthrough
+  Value *OpToForward = llvm::SelectInst::Create(Mask, StoreVal, Passthrough, "",
+                                                I->getIterator());
+
+  ICF->removeUsersOf(I);
```
|
Contributor: I don't know if ordering matters, but I noticed that in processLoad we do this the other way around.

Author: Done.
```diff
+  I->replaceAllUsesWith(OpToForward);
+  salvageAndRemoveInstruction(I);
```
|
Contributor: Do we also need to call MSSAU->removeMemoryAccess and MD->invalidateCachedPointerInfo here, similar to processLoad?

Author: The

```cpp
if (MSSAU)
  MSSAU->removeMemoryAccess(L);
```

call is made indirectly by salvageAndRemoveInstruction(I), which calls removeInstruction(I). I've added the Pointer stuff though.
```diff
+  if (OpToForward->getType()->isPtrOrPtrVectorTy())
+    MD->invalidateCachedPointerInfo(OpToForward);
+  ++NumGVNLoad;
+  return true;
+}
+
 /// Return a pair the first field showing the value number of \p Exp and the
 /// second field showing whether it is a value number newly created.
 std::pair<uint32_t, bool>
```
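For reference on the getOperand(2) / getOperand(3) indexing above: per the LangRef, the masked load intrinsic takes its operands in the order (pointer, alignment, mask, passthrough), e.g.:

```llvm
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
```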
|
|
```diff
@@ -2734,6 +2764,10 @@ bool GVNPass::processInstruction(Instruction *I) {
     return false;
   }
 
+  if (match(I, m_Intrinsic<Intrinsic::masked_load>()) &&
+      processMaskedLoad(cast<IntrinsicInst>(I)))
+    return true;
+
   // For conditional branches, we can perform simple conditional propagation on
   // the condition value itself.
   if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
```
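The rewrite this hooks up can be summarized with a minimal IR sketch (illustrative value names): when a masked load reads from a masked store with the same pointer and the same mask, the enabled lanes yield the stored value and the disabled lanes yield the load's passthrough, which is exactly a select on the mask:

```llvm
; Before: the load is fully covered by the store (same %p, same %mask).
call void @llvm.masked.store.v4f32.p0(<4 x float> %v, ptr %p, i32 1, <4 x i1> %mask)
%l = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 1, <4 x i1> %mask, <4 x float> %pt)

; After: the load is gone; a select reproduces the passthrough lanes.
call void @llvm.masked.store.v4f32.p0(<4 x float> %v, ptr %p, i32 1, <4 x i1> %mask)
%l.fwd = select <4 x i1> %mask, <4 x float> %v, <4 x float> %pt
```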
```diff
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=gvn -S -enable-gvn-memdep=true < %s | FileCheck %s
+; RUN: opt -passes=gvn -S -enable-gvn-memdep=false < %s | FileCheck %s --check-prefix=MEMDEPFALSE
+
+define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel(
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+; MEMDEPFALSE-LABEL: @forward_binop_with_sel(
+; MEMDEPFALSE-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; MEMDEPFALSE-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; MEMDEPFALSE-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; MEMDEPFALSE-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; MEMDEPFALSE-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; MEMDEPFALSE-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; MEMDEPFALSE-NEXT:    [[LOAD_1_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP1]], i32 1, <4 x i1> [[MASK]], <4 x float> [[PASSTHROUGH:%.*]])
+; MEMDEPFALSE-NEXT:    ret <4 x float> [[LOAD_1_0]]
+;
+  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b)
+  %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %fmul = fmul <4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+  %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> %passthrough)
+  ret <4 x float> %load.1.0
+}
```
```diff
@@ -36,6 +36,152 @@ define <128 x i8> @f1(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) {
   ret <128 x i8> %v4
 }
 
+declare <128 x i8> @llvm.masked.load.v128i8.p0(ptr, i32, <128 x i1>, <128 x i8>)
+declare void @llvm.masked.store.v128i8.p0(<128 x i8>, ptr, i32, <128 x i1>)
+define <4 x float> @forward_masked_load(ptr %0, ptr %1) {
```
|
Contributor: Might be worth having a test for an arbitrary mask as well that's passed in as a function argument, i.e. we can apply the optimisation even if the mask isn't known to come from get.active.lane.mask.

Author: I've added …
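A sketch of the kind of test being asked for here (hypothetical, with made-up names; the test actually added is not shown in this view):

```llvm
; The mask is an opaque function argument rather than a get.active.lane.mask
; result; the forwarding should still apply.
define <4 x float> @forward_masked_load_arbitrary_mask(ptr %p, ptr %q, <4 x i1> %mask) {
  %v = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
  call void @llvm.masked.store.v4f32.p0(<4 x float> %v, ptr %q, i32 1, <4 x i1> %mask)
  %r = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %q, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
  ret <4 x float> %r
}
```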
```diff
+; CHECK-LABEL: @forward_masked_load(
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+  %load1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %load1, ptr %1, i32 1, <4 x i1> %mask)
+  %load2 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  ret <4 x float> %load2
+}
```
|
|
```diff
+
+define <4 x float> @forward_binop_splat_i1_mask(ptr %0, ptr %1) {
+; CHECK-LABEL: @forward_binop_splat_i1_mask(
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <4 x float> [[FMUL]]
+;
+  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %fmul = fmul <4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+  %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  ret <4 x float> %load.1.0
+}
+
+define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel(
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]])
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b)
+  %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
+  %fmul = fmul <4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
+  %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> %passthrough)
+  ret <4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_masked_load_scalable(
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[PASSTHROUGH]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP5]]
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+  %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load2
+}
+
+define <vscale x 4 x float> @generate_sel_with_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
```
|
Contributor: This test looks the same as the one above, I think?

Author: There is a difference, which is that the passthrough on %load2 is emitted by the select even when %load1 has a zeroinitializer passthrough. I put this test in when I thought different passthroughs required a bail-out, but I'm not sure that there's value in having both anymore.
```diff
+; CHECK-LABEL: @generate_sel_with_passthrough(
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP5]]
+;
+  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load1 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %load1, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+  %load2 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load2
+}
+
+define <vscale x 4 x float> @forward_binop_with_sel_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @forward_binop_with_sel_scalable(
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP3:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[FMUL]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP3]]
+;
+  %mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
+  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask)
+  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @load_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
+; CHECK-LABEL: @load_mask_differs(
+; CHECK-NEXT:    [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+; CHECK-NEXT:    [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]])
+; CHECK-NEXT:    [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK1]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD_1_0]]
+;
+  %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+  %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask0)
+  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask1, <vscale x 4 x float> %passthrough)
```
|
Contributor: In theory, we could still remove the load because mask1 is a subset of mask0 and the pointer is the same. We'd just need to create a select based on mask1 and the passthrough. Again, this doesn't have to be done in this patch!
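A sketch of the extension described here (hypothetical, not part of this patch): because every lane enabled by %mask1 was written by the %mask0 store, the final load could still be replaced by a select on %mask1:

```llvm
; Hypothetical rewrite when %mask1 is a subset of %mask0 and the pointer matches:
%load.1.0 = select <vscale x 4 x i1> %mask1, <vscale x 4 x float> %fmul, <vscale x 4 x float> %passthrough
```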
```diff
+  ret <vscale x 4 x float> %load.1.0
+}
+
+define <vscale x 4 x float> @store_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
```
|
Contributor: It seems to me that the main difference between this test and the one above is that here there isn't any chance to optimise the final load, because the size of the stored value is less than the loaded one due to mask1 being smaller than mask0.

Author: This was just to check that any mask that differs in any way, even if one mask is a subset of the other, gets rejected. Are you asking for any changes to either test?
```diff
+; CHECK-LABEL: @store_mask_differs(
+; CHECK-NEXT:    [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+; CHECK-NEXT:    [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+; CHECK-NEXT:    [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-NEXT:    [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
+; CHECK-NEXT:    [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK1]])
+; CHECK-NEXT:    [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD_1_0]]
+;
+  %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
+  %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
+  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %gep.0.16 = getelementptr i8, ptr %0, i32 16
+  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
+  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask1)
+  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> %passthrough)
+  ret <vscale x 4 x float> %load.1.0
+}
```
Contributor: Looking at processLoad it seems like you need an additional check, i.e. the if (!MD) bail-out now at the top of processMaskedLoad. Would be good to add a test for this too.

Author: I've added the test masked-load-store-no-mem-dep.ll to deal with this.