Merged
Changes from 7 commits

22 commits
d9d04de
[X86] Combine `store + vselect` to `masked_store``
abhishek-kaushik22 Jun 21, 2025
9eca209
Use pattern match
abhishek-kaushik22 Jun 22, 2025
c0d5cf0
Fix tests
abhishek-kaushik22 Jun 22, 2025
b3a4522
Revert last 3 commits
abhishek-kaushik22 Jun 26, 2025
04366fa
Revert "[X86] Combine `store + vselect` to `masked_store``"
abhishek-kaushik22 Jun 26, 2025
34fa965
Move to DAGCombiner
abhishek-kaushik22 Jun 26, 2025
3106f46
Update macro-fuse-cmp.ll
abhishek-kaushik22 Jun 26, 2025
8c14fba
Use allowsMisalignedMemoryAccesses to check if unaligned stores are a…
abhishek-kaushik22 Jun 27, 2025
f1b33cc
Use reachesChainWithoutSideEffects
abhishek-kaushik22 Jul 20, 2025
82180a8
Merge branch 'main' into masked-store
abhishek-kaushik22 Jul 20, 2025
63356e0
Update tests
abhishek-kaushik22 Jul 21, 2025
6602267
Test more types
abhishek-kaushik22 Jul 22, 2025
0898e47
Fix review comments and update tests
abhishek-kaushik22 Jul 23, 2025
efcf75a
Update DAGCombiner.cpp
abhishek-kaushik22 Jul 29, 2025
acbc2c1
Update DAGCombiner.cpp
abhishek-kaushik22 Jul 29, 2025
baf3d77
Merge branch 'main' into masked-store
abhishek-kaushik22 Jul 29, 2025
4485b09
Place fold at the end of visitSTORE
abhishek-kaushik22 Jul 30, 2025
ad5ead1
Merge branch 'masked-store' of https://github.com/abhishek-kaushik22/…
abhishek-kaushik22 Jul 30, 2025
ed1d804
Update DAGCombiner.cpp
abhishek-kaushik22 Aug 3, 2025
f5aed1f
Merge branch 'main' into masked-store
abhishek-kaushik22 Aug 3, 2025
6d26be2
Merge branch 'main' into masked-store
abhishek-kaushik22 Aug 4, 2025
f4157dd
Add address space check
abhishek-kaushik22 Aug 4, 2025
70 changes: 70 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -71,6 +71,7 @@
#include <functional>
#include <iterator>
#include <optional>
#include <queue>
#include <string>
#include <tuple>
#include <utility>
@@ -22451,12 +22452,81 @@ SDValue DAGCombiner::visitATOMIC_STORE(SDNode *N) {
return SDValue();
}

static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG,
                                 const SDLoc &Dl) {
  using namespace llvm::SDPatternMatch;

  if (!Store->isSimple() || Store->isTruncatingStore())
    return SDValue();

  SDValue StoredVal = Store->getValue();
  SDValue StorePtr = Store->getBasePtr();
  SDValue StoreOffset = Store->getOffset();
  EVT VT = Store->getMemoryVT();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (!TLI.isTypeLegal(VT) || !TLI.isOperationLegalOrCustom(ISD::MSTORE, VT))
    return SDValue();

  SDValue Mask, TrueVec, LoadCh;
  if (!sd_match(StoredVal,
                m_VSelect(m_Value(Mask), m_Value(TrueVec),
                          m_Load(m_Value(LoadCh), m_Specific(StorePtr),
                                 m_Specific(StoreOffset)))))
    return SDValue();

  LoadSDNode *Load = cast<LoadSDNode>(StoredVal.getOperand(2));
  if (!Load->isSimple())
    return SDValue();

  auto IsSafeToFold = [](StoreSDNode *Store, LoadSDNode *Load) {
    std::queue<SDValue> Worklist;

    Worklist.push(Store->getChain());

    while (!Worklist.empty()) {
      SDValue Chain = Worklist.front();
      Worklist.pop();

      SDNode *Node = Chain.getNode();
      if (!Node)
        return false;

      if (Node == Load)
        return true;

      if (const auto *MemNode = dyn_cast<MemSDNode>(Node))
        if (!MemNode->isSimple() || MemNode->writeMem())
          return false;

      if (Node->getOpcode() == ISD::TokenFactor) {
        for (unsigned i = 0; i < Node->getNumOperands(); ++i)
          Worklist.push(Node->getOperand(i));
      } else {
        Worklist.push(Node->getOperand(0));
      }
    }

    return false;
  };

  if (!IsSafeToFold(Store, Load))
    return SDValue();

  return DAG.getMaskedStore(Store->getChain(), Dl, TrueVec, StorePtr,
                            StoreOffset, Mask, VT, Store->getMemOperand(),
                            Store->getAddressingMode());
}

SDValue DAGCombiner::visitSTORE(SDNode *N) {
  StoreSDNode *ST = cast<StoreSDNode>(N);
  SDValue Chain = ST->getChain();
  SDValue Value = ST->getValue();
  SDValue Ptr = ST->getBasePtr();

  if (SDValue MaskedStore = foldToMaskedStore(ST, DAG, SDLoc(N)))
    return MaskedStore;

  // If this is a store of a bit convert, store the input value if the
  // resultant store does not need a higher alignment than the original.
  if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
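For context, the new fold recognizes a store whose value is a vselect between fresh data and a load from the same address and offset, and rewrites it as a single masked store when the target supports ISD::MSTORE for the type and no writing memory operation sits between the load and the store on the chain. An IR-level sketch of the equivalence follows; the fold itself runs on the SelectionDAG, and the function names and the explicit llvm.masked.store form here are illustrative only, not part of this patch.

; Before: read the old contents, blend with %x under %cmp, write everything back.
define void @store_of_select_sketch(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
  %load = load <8 x i32>, ptr %ptr, align 32
  %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
  store <8 x i32> %sel, ptr %ptr, align 32
  ret void
}

; After (conceptually): write only the lanes selected by %cmp.
declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32 immarg, <8 x i1>)

define void @masked_store_sketch(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
  call void @llvm.masked.store.v8i32.p0(<8 x i32> %x, ptr %ptr, i32 32, <8 x i1> %cmp)
  ret void
}

The AArch64 tests added below exercise exactly this shape, along with negative cases (volatile load or store, an intervening store) where the fold must not fire.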
6 changes: 6 additions & 0 deletions llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -691,6 +691,12 @@ void TargetLoweringBase::initActions() {
      setAtomicLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD}, ValVT, MemVT,
                             Expand);

  for (MVT VT : MVT::all_valuetypes()) {
    if (VT == MVT::Other)
      continue;
    setOperationAction(ISD::MSTORE, VT, Expand);
  }

  // We're somewhat special casing MVT::i2 and MVT::i4. Ideally we want to
  // remove this and targets should individually set these types if not legal.
  for (ISD::NodeType NT : enum_seq(ISD::DELETED_NODE, ISD::BUILTIN_OP_END,
282 changes: 282 additions & 0 deletions llvm/test/CodeGen/AArch64/combine-storetomstore.ll
@@ -0,0 +1,282 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=aarch64-- -mattr=+neon | FileCheck %s -check-prefix=AARCH64
; RUN: llc < %s -mtriple=aarch64-- -mattr=+sve | FileCheck %s -check-prefix=SVE

define void @test_masked_store_success(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
; AARCH64-LABEL: test_masked_store_success:
; AARCH64: // %bb.0:
; AARCH64-NEXT: zip1 v3.8b, v2.8b, v0.8b
; AARCH64-NEXT: zip2 v2.8b, v2.8b, v0.8b
; AARCH64-NEXT: ldp q4, q5, [x0]
; AARCH64-NEXT: ushll v3.4s, v3.4h, #0
; AARCH64-NEXT: ushll v2.4s, v2.4h, #0
; AARCH64-NEXT: shl v3.4s, v3.4s, #31
; AARCH64-NEXT: shl v2.4s, v2.4s, #31
; AARCH64-NEXT: cmlt v3.4s, v3.4s, #0
; AARCH64-NEXT: cmlt v2.4s, v2.4s, #0
; AARCH64-NEXT: bif v0.16b, v4.16b, v3.16b
; AARCH64-NEXT: bif v1.16b, v5.16b, v2.16b
; AARCH64-NEXT: stp q0, q1, [x0]
; AARCH64-NEXT: ret
;
; SVE-LABEL: test_masked_store_success:
; SVE: // %bb.0:
; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
; SVE-NEXT: zip2 v3.8b, v2.8b, v0.8b
; SVE-NEXT: zip1 v2.8b, v2.8b, v0.8b
; SVE-NEXT: mov x8, #4 // =0x4
; SVE-NEXT: ptrue p0.s, vl4
; SVE-NEXT: // kill: def $q1 killed $q1 def $z1
; SVE-NEXT: ushll v3.4s, v3.4h, #0
; SVE-NEXT: ushll v2.4s, v2.4h, #0
; SVE-NEXT: shl v3.4s, v3.4s, #31
; SVE-NEXT: shl v2.4s, v2.4s, #31
; SVE-NEXT: cmlt v3.4s, v3.4s, #0
; SVE-NEXT: cmlt v2.4s, v2.4s, #0
; SVE-NEXT: cmpne p1.s, p0/z, z3.s, #0
; SVE-NEXT: cmpne p0.s, p0/z, z2.s, #0
; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
; SVE-NEXT: st1w { z0.s }, p0, [x0]
; SVE-NEXT: ret
%load = load <8 x i32>, ptr %ptr, align 32
%sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
store <8 x i32> %sel, ptr %ptr, align 32
ret void
}

define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
; AARCH64-LABEL: test_masked_store_volatile_load:
; AARCH64: // %bb.0:
; AARCH64-NEXT: zip1 v3.8b, v2.8b, v0.8b
; AARCH64-NEXT: zip2 v2.8b, v2.8b, v0.8b
; AARCH64-NEXT: ldr q4, [x0]
; AARCH64-NEXT: ldr q5, [x0, #16]
; AARCH64-NEXT: ushll v3.4s, v3.4h, #0
; AARCH64-NEXT: ushll v2.4s, v2.4h, #0
; AARCH64-NEXT: shl v3.4s, v3.4s, #31
; AARCH64-NEXT: shl v2.4s, v2.4s, #31
; AARCH64-NEXT: cmlt v3.4s, v3.4s, #0
; AARCH64-NEXT: cmlt v2.4s, v2.4s, #0
; AARCH64-NEXT: bif v0.16b, v4.16b, v3.16b
; AARCH64-NEXT: bif v1.16b, v5.16b, v2.16b
; AARCH64-NEXT: stp q0, q1, [x0]
; AARCH64-NEXT: ret
;
; SVE-LABEL: test_masked_store_volatile_load:
; SVE: // %bb.0:
; SVE-NEXT: zip1 v3.8b, v2.8b, v0.8b
; SVE-NEXT: zip2 v2.8b, v2.8b, v0.8b
; SVE-NEXT: ldr q4, [x0]
; SVE-NEXT: ldr q5, [x0, #16]
; SVE-NEXT: ushll v3.4s, v3.4h, #0
; SVE-NEXT: ushll v2.4s, v2.4h, #0
; SVE-NEXT: shl v3.4s, v3.4s, #31
; SVE-NEXT: shl v2.4s, v2.4s, #31
; SVE-NEXT: cmlt v3.4s, v3.4s, #0
; SVE-NEXT: cmlt v2.4s, v2.4s, #0
; SVE-NEXT: bif v0.16b, v4.16b, v3.16b
; SVE-NEXT: bif v1.16b, v5.16b, v2.16b
; SVE-NEXT: stp q0, q1, [x0]
; SVE-NEXT: ret
%load = load volatile <8 x i32>, ptr %ptr, align 32
%sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
store <8 x i32> %sel, ptr %ptr, align 32
ret void
}

define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
; AARCH64-LABEL: test_masked_store_volatile_store:
; AARCH64: // %bb.0:
; AARCH64-NEXT: zip1 v3.8b, v2.8b, v0.8b
; AARCH64-NEXT: zip2 v2.8b, v2.8b, v0.8b
; AARCH64-NEXT: ldp q4, q5, [x0]
; AARCH64-NEXT: ushll v3.4s, v3.4h, #0
; AARCH64-NEXT: ushll v2.4s, v2.4h, #0
; AARCH64-NEXT: shl v3.4s, v3.4s, #31
; AARCH64-NEXT: shl v2.4s, v2.4s, #31
; AARCH64-NEXT: cmlt v3.4s, v3.4s, #0
; AARCH64-NEXT: cmlt v2.4s, v2.4s, #0
; AARCH64-NEXT: bif v0.16b, v4.16b, v3.16b
; AARCH64-NEXT: bif v1.16b, v5.16b, v2.16b
; AARCH64-NEXT: str q0, [x0]
; AARCH64-NEXT: str q1, [x0, #16]
; AARCH64-NEXT: ret
;
; SVE-LABEL: test_masked_store_volatile_store:
; SVE: // %bb.0:
; SVE-NEXT: zip1 v3.8b, v2.8b, v0.8b
; SVE-NEXT: zip2 v2.8b, v2.8b, v0.8b
; SVE-NEXT: ldp q4, q5, [x0]
; SVE-NEXT: ushll v3.4s, v3.4h, #0
; SVE-NEXT: ushll v2.4s, v2.4h, #0
; SVE-NEXT: shl v3.4s, v3.4s, #31
; SVE-NEXT: shl v2.4s, v2.4s, #31
; SVE-NEXT: cmlt v3.4s, v3.4s, #0
; SVE-NEXT: cmlt v2.4s, v2.4s, #0
; SVE-NEXT: bif v0.16b, v4.16b, v3.16b
; SVE-NEXT: bif v1.16b, v5.16b, v2.16b
; SVE-NEXT: str q0, [x0]
; SVE-NEXT: str q1, [x0, #16]
; SVE-NEXT: ret
%load = load <8 x i32>, ptr %ptr, align 32
%sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
store volatile <8 x i32> %sel, ptr %ptr, align 32
ret void
}

declare void @use_vec(<8 x i32>)

define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %cmp) {
; AARCH64-LABEL: test_masked_store_intervening:
; AARCH64: // %bb.0:
; AARCH64-NEXT: sub sp, sp, #96
; AARCH64-NEXT: str d8, [sp, #64] // 8-byte Folded Spill
; AARCH64-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; AARCH64-NEXT: .cfi_def_cfa_offset 96
; AARCH64-NEXT: .cfi_offset w19, -8
; AARCH64-NEXT: .cfi_offset w30, -16
; AARCH64-NEXT: .cfi_offset b8, -32
; AARCH64-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill
; AARCH64-NEXT: ldp q1, q3, [x0]
; AARCH64-NEXT: movi v0.2d, #0000000000000000
; AARCH64-NEXT: fmov d8, d2
; AARCH64-NEXT: mov x19, x0
; AARCH64-NEXT: stp q1, q3, [sp] // 32-byte Folded Spill
; AARCH64-NEXT: movi v1.2d, #0000000000000000
; AARCH64-NEXT: stp q0, q0, [x0]
; AARCH64-NEXT: bl use_vec
; AARCH64-NEXT: zip2 v0.8b, v8.8b, v0.8b
; AARCH64-NEXT: ldp q3, q2, [sp, #16] // 32-byte Folded Reload
; AARCH64-NEXT: zip1 v1.8b, v8.8b, v0.8b
; AARCH64-NEXT: ushll v0.4s, v0.4h, #0
; AARCH64-NEXT: ldr d8, [sp, #64] // 8-byte Folded Reload
; AARCH64-NEXT: shl v0.4s, v0.4s, #31
; AARCH64-NEXT: ushll v1.4s, v1.4h, #0
; AARCH64-NEXT: cmlt v0.4s, v0.4s, #0
; AARCH64-NEXT: shl v1.4s, v1.4s, #31
; AARCH64-NEXT: bsl v0.16b, v2.16b, v3.16b
; AARCH64-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
; AARCH64-NEXT: ldr q3, [sp] // 16-byte Folded Reload
; AARCH64-NEXT: cmlt v1.4s, v1.4s, #0
; AARCH64-NEXT: bsl v1.16b, v2.16b, v3.16b
; AARCH64-NEXT: stp q1, q0, [x19]
; AARCH64-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; AARCH64-NEXT: add sp, sp, #96
; AARCH64-NEXT: ret
;
; SVE-LABEL: test_masked_store_intervening:
; SVE: // %bb.0:
; SVE-NEXT: sub sp, sp, #96
; SVE-NEXT: str d8, [sp, #64] // 8-byte Folded Spill
; SVE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; SVE-NEXT: .cfi_def_cfa_offset 96
; SVE-NEXT: .cfi_offset w19, -8
; SVE-NEXT: .cfi_offset w30, -16
; SVE-NEXT: .cfi_offset b8, -32
; SVE-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill
; SVE-NEXT: ldp q1, q3, [x0]
; SVE-NEXT: movi v0.2d, #0000000000000000
; SVE-NEXT: fmov d8, d2
; SVE-NEXT: mov x19, x0
; SVE-NEXT: stp q1, q3, [sp] // 32-byte Folded Spill
; SVE-NEXT: movi v1.2d, #0000000000000000
; SVE-NEXT: stp q0, q0, [x0]
; SVE-NEXT: bl use_vec
; SVE-NEXT: zip2 v0.8b, v8.8b, v0.8b
; SVE-NEXT: ldp q3, q2, [sp, #16] // 32-byte Folded Reload
; SVE-NEXT: zip1 v1.8b, v8.8b, v0.8b
; SVE-NEXT: ushll v0.4s, v0.4h, #0
; SVE-NEXT: ldr d8, [sp, #64] // 8-byte Folded Reload
; SVE-NEXT: shl v0.4s, v0.4s, #31
; SVE-NEXT: ushll v1.4s, v1.4h, #0
; SVE-NEXT: cmlt v0.4s, v0.4s, #0
; SVE-NEXT: shl v1.4s, v1.4s, #31
; SVE-NEXT: bsl v0.16b, v2.16b, v3.16b
; SVE-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload
; SVE-NEXT: ldr q3, [sp] // 16-byte Folded Reload
; SVE-NEXT: cmlt v1.4s, v1.4s, #0
; SVE-NEXT: bsl v1.16b, v2.16b, v3.16b
; SVE-NEXT: stp q1, q0, [x19]
; SVE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; SVE-NEXT: add sp, sp, #96
; SVE-NEXT: ret
%load = load <8 x i32>, ptr %ptr, align 32
store <8 x i32> zeroinitializer, ptr %ptr, align 32
%tmp = load <8 x i32>, ptr %ptr
call void @use_vec(<8 x i32> %tmp)
%sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
store <8 x i32> %sel, ptr %ptr, align 32
ret void
}


define void @test_masked_store_multiple(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %cmp, <8 x i1> %cmp2) {
; AARCH64-LABEL: test_masked_store_multiple:
; AARCH64: // %bb.0:
; AARCH64-NEXT: zip1 v6.8b, v4.8b, v0.8b
; AARCH64-NEXT: zip2 v4.8b, v4.8b, v0.8b
; AARCH64-NEXT: zip1 v7.8b, v5.8b, v0.8b
; AARCH64-NEXT: zip2 v5.8b, v5.8b, v0.8b
; AARCH64-NEXT: ldp q16, q17, [x0]
; AARCH64-NEXT: ushll v6.4s, v6.4h, #0
; AARCH64-NEXT: ushll v4.4s, v4.4h, #0
; AARCH64-NEXT: ushll v7.4s, v7.4h, #0
; AARCH64-NEXT: ushll v5.4s, v5.4h, #0
; AARCH64-NEXT: shl v6.4s, v6.4s, #31
; AARCH64-NEXT: shl v4.4s, v4.4s, #31
; AARCH64-NEXT: shl v7.4s, v7.4s, #31
; AARCH64-NEXT: shl v5.4s, v5.4s, #31
; AARCH64-NEXT: cmlt v6.4s, v6.4s, #0
; AARCH64-NEXT: cmlt v4.4s, v4.4s, #0
; AARCH64-NEXT: cmlt v7.4s, v7.4s, #0
; AARCH64-NEXT: cmlt v5.4s, v5.4s, #0
; AARCH64-NEXT: bif v0.16b, v16.16b, v6.16b
; AARCH64-NEXT: ldp q6, q16, [x1]
; AARCH64-NEXT: bif v1.16b, v17.16b, v4.16b
; AARCH64-NEXT: bif v2.16b, v6.16b, v7.16b
; AARCH64-NEXT: bif v3.16b, v16.16b, v5.16b
; AARCH64-NEXT: stp q0, q1, [x0]
; AARCH64-NEXT: stp q2, q3, [x1]
; AARCH64-NEXT: ret
;
; SVE-LABEL: test_masked_store_multiple:
; SVE: // %bb.0:
; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
; SVE-NEXT: zip2 v6.8b, v4.8b, v0.8b
; SVE-NEXT: zip1 v4.8b, v4.8b, v0.8b
; SVE-NEXT: mov x8, #4 // =0x4
; SVE-NEXT: zip2 v7.8b, v5.8b, v0.8b
; SVE-NEXT: zip1 v5.8b, v5.8b, v0.8b
; SVE-NEXT: // kill: def $q3 killed $q3 def $z3
; SVE-NEXT: // kill: def $q1 killed $q1 def $z1
; SVE-NEXT: ptrue p0.s, vl4
; SVE-NEXT: ushll v6.4s, v6.4h, #0
; SVE-NEXT: ushll v4.4s, v4.4h, #0
; SVE-NEXT: ushll v7.4s, v7.4h, #0
; SVE-NEXT: ushll v5.4s, v5.4h, #0
; SVE-NEXT: shl v6.4s, v6.4s, #31
; SVE-NEXT: shl v4.4s, v4.4s, #31
; SVE-NEXT: shl v7.4s, v7.4s, #31
; SVE-NEXT: shl v5.4s, v5.4s, #31
; SVE-NEXT: cmlt v6.4s, v6.4s, #0
; SVE-NEXT: cmlt v4.4s, v4.4s, #0
; SVE-NEXT: cmlt v7.4s, v7.4s, #0
; SVE-NEXT: cmlt v5.4s, v5.4s, #0
; SVE-NEXT: cmpne p1.s, p0/z, z6.s, #0
; SVE-NEXT: ldr q6, [x1]
; SVE-NEXT: cmpne p2.s, p0/z, z4.s, #0
; SVE-NEXT: cmpne p0.s, p0/z, z7.s, #0
; SVE-NEXT: bif v2.16b, v6.16b, v5.16b
; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
; SVE-NEXT: st1w { z0.s }, p2, [x0]
; SVE-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2]
; SVE-NEXT: str q2, [x1]
; SVE-NEXT: ret
%load = load <8 x i32>, ptr %ptr1, align 32
%load2 = load <8 x i32>, ptr %ptr2, align 32
%sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %load
%sel2 = select <8 x i1> %cmp2, <8 x i32> %y, <8 x i32> %load2
store <8 x i32> %sel, ptr %ptr1, align 32
store <8 x i32> %sel2, ptr %ptr2, align 32
ret void
}