Merged
Changes from 7 commits

22 commits
d9d04de
[X86] Combine `store + vselect` to `masked_store``
abhishek-kaushik22 Jun 21, 2025
9eca209
Use pattern match
abhishek-kaushik22 Jun 22, 2025
c0d5cf0
Fix tests
abhishek-kaushik22 Jun 22, 2025
b3a4522
Revert last 3 commits
abhishek-kaushik22 Jun 26, 2025
04366fa
Revert "[X86] Combine `store + vselect` to `masked_store``"
abhishek-kaushik22 Jun 26, 2025
34fa965
Move to DAGCombiner
abhishek-kaushik22 Jun 26, 2025
3106f46
Update macro-fuse-cmp.ll
abhishek-kaushik22 Jun 26, 2025
8c14fba
Use allowsMisalignedMemoryAccesses to check if unaligned stores are a…
abhishek-kaushik22 Jun 27, 2025
f1b33cc
Use reachesChainWithoutSideEffects
abhishek-kaushik22 Jul 20, 2025
82180a8
Merge branch 'main' into masked-store
abhishek-kaushik22 Jul 20, 2025
63356e0
Update tests
abhishek-kaushik22 Jul 21, 2025
6602267
Test more types
abhishek-kaushik22 Jul 22, 2025
0898e47
Fix review comments and update tests
abhishek-kaushik22 Jul 23, 2025
efcf75a
Update DAGCombiner.cpp
abhishek-kaushik22 Jul 29, 2025
acbc2c1
Update DAGCombiner.cpp
abhishek-kaushik22 Jul 29, 2025
baf3d77
Merge branch 'main' into masked-store
abhishek-kaushik22 Jul 29, 2025
4485b09
Place fold at the end of visitSTORE
abhishek-kaushik22 Jul 30, 2025
ad5ead1
Merge branch 'masked-store' of https://github.com/abhishek-kaushik22/…
abhishek-kaushik22 Jul 30, 2025
ed1d804
Update DAGCombiner.cpp
abhishek-kaushik22 Aug 3, 2025
f5aed1f
Merge branch 'main' into masked-store
abhishek-kaushik22 Aug 3, 2025
6d26be2
Merge branch 'main' into masked-store
abhishek-kaushik22 Aug 4, 2025
f4157dd
Add address space check
abhishek-kaushik22 Aug 4, 2025
70 changes: 70 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -71,6 +71,7 @@
#include <functional>
#include <iterator>
#include <optional>
#include <queue>
#include <string>
#include <tuple>
#include <utility>
@@ -22451,12 +22452,81 @@ SDValue DAGCombiner::visitATOMIC_STORE(SDNode *N) {
return SDValue();
}

static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG,
                                 const SDLoc &Dl) {
  using namespace llvm::SDPatternMatch;

  if (!Store->isSimple() || Store->isTruncatingStore())
    return SDValue();

  SDValue StoredVal = Store->getValue();
  SDValue StorePtr = Store->getBasePtr();
  SDValue StoreOffset = Store->getOffset();
  EVT VT = Store->getMemoryVT();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (!TLI.isTypeLegal(VT) || !TLI.isOperationLegalOrCustom(ISD::MSTORE, VT))
    return SDValue();

  SDValue Mask, TrueVec, LoadCh;
  if (!sd_match(StoredVal,
                m_VSelect(m_Value(Mask), m_Value(TrueVec),
                          m_Load(m_Value(LoadCh), m_Specific(StorePtr),
                                 m_Specific(StoreOffset)))))
    return SDValue();

  LoadSDNode *Load = cast<LoadSDNode>(StoredVal.getOperand(2));
  if (!Load->isSimple())
    return SDValue();

  auto IsSafeToFold = [](StoreSDNode *Store, LoadSDNode *Load) {
    std::queue<SDValue> Worklist;

    Worklist.push(Store->getChain());

    while (!Worklist.empty()) {
      SDValue Chain = Worklist.front();
      Worklist.pop();

      SDNode *Node = Chain.getNode();
      if (!Node)
        return false;

      if (Node == Load)
        return true;

      if (const auto *MemNode = dyn_cast<MemSDNode>(Node))
        if (!MemNode->isSimple() || MemNode->writeMem())
          return false;

      if (Node->getOpcode() == ISD::TokenFactor) {
        for (unsigned i = 0; i < Node->getNumOperands(); ++i)
          Worklist.push(Node->getOperand(i));
      } else {
        Worklist.push(Node->getOperand(0));
      }
    }

    return false;
  };

  if (!IsSafeToFold(Store, Load))
    return SDValue();

  return DAG.getMaskedStore(Store->getChain(), Dl, TrueVec, StorePtr,
                            StoreOffset, Mask, VT, Store->getMemOperand(),
                            Store->getAddressingMode());
}

SDValue DAGCombiner::visitSTORE(SDNode *N) {
  StoreSDNode *ST = cast<StoreSDNode>(N);
  SDValue Chain = ST->getChain();
  SDValue Value = ST->getValue();
  SDValue Ptr = ST->getBasePtr();

  if (SDValue MaskedStore = foldToMaskedStore(ST, DAG, SDLoc(N)))
    return MaskedStore;

  // If this is a store of a bit convert, store the input value if the
  // resultant store does not need a higher alignment than the original.
  if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
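For context, the new fold recognizes a store whose value is a vselect between fresh data and a load from the same address and offset, and rewrites it as a single masked store when the target supports ISD::MSTORE for the type and no writing memory operation sits between the load and the store on the chain. An IR-level sketch of the equivalence follows; the fold itself runs on the SelectionDAG, and the function names and the explicit llvm.masked.store form here are illustrative only, not part of this patch.

; Before: read the old contents, blend with %x under %cmp, write everything back.
define void @store_of_select_sketch(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
  %load = load <8 x i32>, ptr %ptr, align 32
  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
  store <8 x i32> %sel, ptr %ptr, align 32
  ret void
}

; After (conceptually): write only the lanes selected by %cmp.
declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32 immarg, <8 x i1>)

define void @masked_store_sketch(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
  call void @llvm.masked.store.v8i32.p0(<8 x i32> %x, ptr %ptr, i32 32, <8 x i1> %cmp)
  ret void
}

The AArch64 tests added below exercise exactly this shape, along with negative cases (volatile load or store, an intervening store) where the fold must not fire.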
6 changes: 6 additions & 0 deletions llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -691,6 +691,12 @@ void TargetLoweringBase::initActions() {
      setAtomicLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD}, ValVT, MemVT,
                             Expand);

  for (MVT VT : MVT::all_valuetypes()) {
    if (VT == MVT::Other)
      continue;
    setOperationAction(ISD::MSTORE, VT, Expand);
  }

  // We're somewhat special casing MVT::i2 and MVT::i4. Ideally we want to
  // remove this and targets should individually set these types if not legal.
  for (ISD::NodeType NT : enum_seq(ISD::DELETED_NODE, ISD::BUILTIN_OP_END,
282 changes: 282 additions & 0 deletions llvm/test/CodeGen/AArch64/combine-storetomstore.ll
@@ -0,0 +1,282 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=aarch64-- -mattr=+neon | FileCheck %s -check-prefix=AARCH64
; RUN: llc < %s -mtriple=aarch64-- -mattr=+sve | FileCheck %s -check-prefix=SVE

define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
; AARCH64-LABEL: test_masked_store_success:
; AARCH64: // %bb.0:
; AARCH64-NEXT: zip1 v3.8b, v2.8b, v0.8b
; AARCH64-NEXT: zip2 v2.8b, v2.8b, v0.8b
; AARCH64-NEXT: ldp q4, q5, [x0]
; AARCH64-NEXT: ushll v3.4s, v3.4h, #0
; AARCH64-NEXT: ushll v2.4s, v2.4h, #0
; AARCH64-NEXT: shl v3.4s, v3.4s, #31
; AARCH64-NEXT: shl v2.4s, v2.4s, #31
; AARCH64-NEXT: cmlt v3.4s, v3.4s, #0
; AARCH64-NEXT: cmlt v2.4s, v2.4s, #0
; AARCH64-NEXT: bif v0.16b, v4.16b, v3.16b
; AARCH64-NEXT: bif v1.16b, v5.16b, v2.16b
; AARCH64-NEXT: stp q0, q1, [x0]
; AARCH64-NEXT: ret
;
; SVE-LABEL: test_masked_store_success:
; SVE: // %bb.0:
; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
; SVE-NEXT: zip2 v3.8b, v2.8b, v0.8b
; SVE-NEXT: zip1 v2.8b, v2.8b, v0.8b
; SVE-NEXT: mov x8, #4 // =0x4
; SVE-NEXT: ptrue p0.s, vl4
; SVE-NEXT: // kill: def $q1 killed $q1 def $z1
; SVE-NEXT: ushll v3.4s, v3.4h, #0
; SVE-NEXT: ushll v2.4s, v2.4h, #0
; SVE-NEXT: shl v3.4s, v3.4s, #31
; SVE-NEXT: shl v2.4s, v2.4s, #31
; SVE-NEXT: cmlt v3.4s, v3.4s, #0
; SVE-NEXT: cmlt v2.4s, v2.4s, #0
; SVE-NEXT: cmpne p1.s, p0/z, z3.s, #0
; SVE-NEXT: cmpne p0.s, p0/z, z2.s, #0
; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
; SVE-NEXT: st1w { z0.s }, p0, [x0]
; SVE-NEXT: ret
%load = load <8 x i32>, ptr %ptr, align 32
%sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
store <8 x i32> %sel, ptr %ptr, align 32
ret void
}

define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
; AARCH64-LABEL: test_masked_store_volatile_load:
; AARCH64: // %bb.0:
; AARCH64-NEXT: zip1 v3.8b, v2.8b, v0.8b
; AARCH64-NEXT: zip2 v2.8b, v2.8b, v0.8b
; AARCH64-NEXT: ldr q4, [x0]
; AARCH64-NEXT: ldr q5, [x0, #16]
; AARCH64-NEXT: ushll v3.4s, v3.4h, #0
; AARCH64-NEXT: ushll v2.4s, v2.4h, #0
; AARCH64-NEXT: shl v3.4s, v3.4s, #31
; AARCH64-NEXT: shl v2.4s, v2.4s, #31
; AARCH64-NEXT: cmlt v3.4s, v3.4s, #0
; AARCH64-NEXT: cmlt v2.4s, v2.4s, #0
; AARCH64-NEXT: bif v0.16b, v4.16b, v3.16b
; AARCH64-NEXT: bif v1.16b, v5.16b, v2.16b
; AARCH64-NEXT: stp q0, q1, [x0]
; AARCH64-NEXT: ret
;
; SVE-LABEL: test_masked_store_volatile_load:
; SVE: // %bb.0:
; SVE-NEXT: zip1 v3.8b, v2.8b, v0.8b
; SVE-NEXT: zip2 v2.8b, v2.8b, v0.8b
; SVE-NEXT: ldr q4, [x0]
; SVE-NEXT: ldr q5, [x0, #16]
; SVE-NEXT: ushll v3.4s, v3.4h, #0
; SVE-NEXT: ushll v2.4s, v2.4h, #0
; SVE-NEXT: shl v3.4s, v3.4s, #31
; SVE-NEXT: shl v2.4s, v2.4s, #31
; SVE-NEXT: cmlt v3.4s, v3.4s, #0
; SVE-NEXT: cmlt v2.4s, v2.4s, #0
; SVE-NEXT: bif v0.16b, v4.16b, v3.16b
; SVE-NEXT: bif v1.16b, v5.16b, v2.16b
; SVE-NEXT: stp q0, q1, [x0]
; SVE-NEXT: ret
%load = load volatile <8 x i32>, ptr %ptr, align 32
%sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
store <8 x i32> %sel, ptr %ptr, align 32
ret void
}

define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
; AARCH64-LABEL: test_masked_store_volatile_store:
; AARCH64: // %bb.0:
; AARCH64-NEXT: zip1 v3.8b, v2.8b, v0.8b
; AARCH64-NEXT: zip2 v2.8b, v2.8b, v0.8b
; AARCH64-NEXT: ldp q4, q5, [x0]
; AARCH64-NEXT: ushll v3.4s, v3.4h, #0
; AARCH64-NEXT: ushll v2.4s, v2.4h, #0
; AARCH64-NEXT: shl v3.4s, v3.4s, #31
; AARCH64-NEXT: shl v2.4s, v2.4s, #31
; AARCH64-NEXT: cmlt v3.4s, v3.4s, #0
; AARCH64-NEXT: cmlt v2.4s, v2.4s, #0
; AARCH64-NEXT: bif v0.16b, v4.16b, v3.16b
; AARCH64-NEXT: bif v1.16b, v5.16b, v2.16b
; AARCH64-NEXT: str q0, [x0]
; AARCH64-NEXT: str q1, [x0, #16]
; AARCH64-NEXT: ret
;
; SVE-LABEL: test_masked_store_volatile_store:
; SVE: // %bb.0:
; SVE-NEXT: zip1 v3.8b, v2.8b, v0.8b
; SVE-NEXT: zip2 v2.8b, v2.8b, v0.8b
; SVE-NEXT: ldp q4, q5, [x0]
; SVE-NEXT: ushll v3.4s, v3.4h, #0
; SVE-NEXT: ushll v2.4s, v2.4h, #0
; SVE-NEXT: shl v3.4s, v3.4s, #31
; SVE-NEXT: shl v2.4s, v2.4s, #31
; SVE-NEXT: cmlt v3.4s, v3.4s, #0
; SVE-NEXT: cmlt v2.4s, v2.4s, #0
; SVE-NEXT: bif v0.16b, v4.16b, v3.16b
; SVE-NEXT: bif v1.16b, v5.16b, v2.16b
; SVE-NEXT: str q0, [x0]
; SVE-NEXT: str q1, [x0, #16]
; SVE-NEXT: ret
%load = load <8 x i32>, ptr %ptr, align 32
%sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
store volatile <8 x i32> %sel, ptr %ptr, align 32
ret void
}

declare void @use_vec(<8 x i32>)

define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
; AARCH64-LABEL: test_masked_store_intervening:
; AARCH64: // %bb.0:
; AARCH64-NEXT: sub sp, sp, #96
; AARCH64-NEXT: str d8, [sp, #64] // 8-byte Folded Spill
; AARCH64-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; AARCH64-NEXT: .cfi_def_cfa_offset 96
; AARCH64-NEXT: .cfi_offset w19, -8
; AARCH64-NEXT: .cfi_offset w30, -16
; AARCH64-NEXT: .cfi_offset b8, -32
; AARCH64-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill
; AARCH64-NEXT: ldp q1, q3, [x0]
; AARCH64-NEXT: movi v0.2d, #0000000000000000
; AARCH64-NEXT: fmov d8, d2
; AARCH64-NEXT: mov x19, x0
; AARCH64-NEXT: stp q1, q3, [sp] // 32-byte Folded Spill
; AARCH64-NEXT: movi v1.2d, #0000000000000000
; AARCH64-NEXT: stp q0, q0, [x0]
; AARCH64-NEXT: bl use_vec
; AARCH64-NEXT: zip2 v0.8b, v8.8b, v0.8b
; AARCH64-NEXT: ldp q3, q2, [sp, #16] // 32-byte Folded Reload
; AARCH64-NEXT: zip1 v1.8b, v8.8b, v0.8b
; AARCH64-NEXT: ushll v0.4s, v0.4h, #0
; AARCH64-NEXT: ldr d8, [sp, #64] // 8-byte Folded Reload
; AARCH64-NEXT: shl v0.4s, v0.4s, #31
; AARCH64-NEXT: ushll v1.4s, v1.4h, #0
; AARCH64-NEXT: cmlt v0.4s, v0.4s, #0
; AARCH64-NEXT: shl v1.4s, v1.4s, #31
; AARCH64-NEXT: bsl v0.16b, v2.16b, v3.16b
; AARCH64-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
; AARCH64-NEXT: ldr q3, [sp] // 16-byte Folded Reload
; AARCH64-NEXT: cmlt v1.4s, v1.4s, #0
; AARCH64-NEXT: bsl v1.16b, v2.16b, v3.16b
; AARCH64-NEXT: stp q1, q0, [x19]
; AARCH64-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; AARCH64-NEXT: add sp, sp, #96
; AARCH64-NEXT: ret
;
; SVE-LABEL: test_masked_store_intervening:
; SVE: // %bb.0:
; SVE-NEXT: sub sp, sp, #96
; SVE-NEXT: str d8, [sp, #64] // 8-byte Folded Spill
; SVE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; SVE-NEXT: .cfi_def_cfa_offset 96
; SVE-NEXT: .cfi_offset w19, -8
; SVE-NEXT: .cfi_offset w30, -16
; SVE-NEXT: .cfi_offset b8, -32
; SVE-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill
; SVE-NEXT: ldp q1, q3, [x0]
; SVE-NEXT: movi v0.2d, #0000000000000000
; SVE-NEXT: fmov d8, d2
; SVE-NEXT: mov x19, x0
; SVE-NEXT: stp q1, q3, [sp] // 32-byte Folded Spill
; SVE-NEXT: movi v1.2d, #0000000000000000
; SVE-NEXT: stp q0, q0, [x0]
; SVE-NEXT: bl use_vec
; SVE-NEXT: zip2 v0.8b, v8.8b, v0.8b
; SVE-NEXT: ldp q3, q2, [sp, #16] // 32-byte Folded Reload
; SVE-NEXT: zip1 v1.8b, v8.8b, v0.8b
; SVE-NEXT: ushll v0.4s, v0.4h, #0
; SVE-NEXT: ldr d8, [sp, #64] // 8-byte Folded Reload
; SVE-NEXT: shl v0.4s, v0.4s, #31
; SVE-NEXT: ushll v1.4s, v1.4h, #0
; SVE-NEXT: cmlt v0.4s, v0.4s, #0
; SVE-NEXT: shl v1.4s, v1.4s, #31
; SVE-NEXT: bsl v0.16b, v2.16b, v3.16b
; SVE-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
; SVE-NEXT: ldr q3, [sp] // 16-byte Folded Reload
; SVE-NEXT: cmlt v1.4s, v1.4s, #0
; SVE-NEXT: bsl v1.16b, v2.16b, v3.16b
; SVE-NEXT: stp q1, q0, [x19]
; SVE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; SVE-NEXT: add sp, sp, #96
; SVE-NEXT: ret
%load = load <8 x i32>, ptr %ptr, align 32
store <8 x i32> zeroinitializer, ptr %ptr, align 32
%tmp = load <8 x i32>, ptr %ptr
call void @use_vec(<8 x i32> %tmp)
%sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
store <8 x i32> %sel, ptr %ptr, align 32
ret void
}


define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) {
; AARCH64-LABEL: test_masked_store_multiple:
; AARCH64: // %bb.0:
; AARCH64-NEXT: zip1 v6.8b, v4.8b, v0.8b
; AARCH64-NEXT: zip2 v4.8b, v4.8b, v0.8b
; AARCH64-NEXT: zip1 v7.8b, v5.8b, v0.8b
; AARCH64-NEXT: zip2 v5.8b, v5.8b, v0.8b
; AARCH64-NEXT: ldp q16, q17, [x0]
; AARCH64-NEXT: ushll v6.4s, v6.4h, #0
; AARCH64-NEXT: ushll v4.4s, v4.4h, #0
; AARCH64-NEXT: ushll v7.4s, v7.4h, #0
; AARCH64-NEXT: ushll v5.4s, v5.4h, #0
; AARCH64-NEXT: shl v6.4s, v6.4s, #31
; AARCH64-NEXT: shl v4.4s, v4.4s, #31
; AARCH64-NEXT: shl v7.4s, v7.4s, #31
; AARCH64-NEXT: shl v5.4s, v5.4s, #31
; AARCH64-NEXT: cmlt v6.4s, v6.4s, #0
; AARCH64-NEXT: cmlt v4.4s, v4.4s, #0
; AARCH64-NEXT: cmlt v7.4s, v7.4s, #0
; AARCH64-NEXT: cmlt v5.4s, v5.4s, #0
; AARCH64-NEXT: bif v0.16b, v16.16b, v6.16b
; AARCH64-NEXT: ldp q6, q16, [x1]
; AARCH64-NEXT: bif v1.16b, v17.16b, v4.16b
; AARCH64-NEXT: bif v2.16b, v6.16b, v7.16b
; AARCH64-NEXT: bif v3.16b, v16.16b, v5.16b
; AARCH64-NEXT: stp q0, q1, [x0]
; AARCH64-NEXT: stp q2, q3, [x1]
; AARCH64-NEXT: ret
;
; SVE-LABEL: test_masked_store_multiple:
; SVE: // %bb.0:
; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
; SVE-NEXT: zip2 v6.8b, v4.8b, v0.8b
; SVE-NEXT: zip1 v4.8b, v4.8b, v0.8b
; SVE-NEXT: mov x8, #4 // =0x4
; SVE-NEXT: zip2 v7.8b, v5.8b, v0.8b
; SVE-NEXT: zip1 v5.8b, v5.8b, v0.8b
; SVE-NEXT: // kill: def $q3 killed $q3 def $z3
; SVE-NEXT: // kill: def $q1 killed $q1 def $z1
; SVE-NEXT: ptrue p0.s, vl4
; SVE-NEXT: ushll v6.4s, v6.4h, #0
; SVE-NEXT: ushll v4.4s, v4.4h, #0
; SVE-NEXT: ushll v7.4s, v7.4h, #0
; SVE-NEXT: ushll v5.4s, v5.4h, #0
; SVE-NEXT: shl v6.4s, v6.4s, #31
; SVE-NEXT: shl v4.4s, v4.4s, #31
; SVE-NEXT: shl v7.4s, v7.4s, #31
; SVE-NEXT: shl v5.4s, v5.4s, #31
; SVE-NEXT: cmlt v6.4s, v6.4s, #0
; SVE-NEXT: cmlt v4.4s, v4.4s, #0
; SVE-NEXT: cmlt v7.4s, v7.4s, #0
; SVE-NEXT: cmlt v5.4s, v5.4s, #0
; SVE-NEXT: cmpne p1.s, p0/z, z6.s, #0
; SVE-NEXT: ldr q6, [x1]
; SVE-NEXT: cmpne p2.s, p0/z, z4.s, #0
; SVE-NEXT: cmpne p0.s, p0/z, z7.s, #0
; SVE-NEXT: bif v2.16b, v6.16b, v5.16b
; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
; SVE-NEXT: st1w { z0.s }, p2, [x0]
; SVE-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2]
; SVE-NEXT: str q2, [x1]
; SVE-NEXT: ret
%load = load <8 x i32>, ptr %ptr1, align 32
%load2 = load <8 x i32>, ptr %ptr2, align 32
%sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
%sel2 = select <8 x i1> %cmp2, <8 x i32> %y, <8 x i32> %load2
store <8 x i32> %sel, ptr %ptr1, align 32
store <8 x i32> %sel2, ptr %ptr2, align 32
ret void
}