[Intrinsics][AArch64] Add intrinsics for masking off aliasing vector lanes #117007
@@ -24128,8 +24128,7 @@ Overview:
 Given a vector load from %ptrA followed by a vector store to %ptrB, this
 instruction generates a mask where an active lane indicates that the
 write-after-read sequence can be performed safely for that lane, without the
-danger of it turning into a read-after-write sequence and introducing a
-store-to-load forwarding hazard.
+danger of a write-after-read hazard occurring.

 A write-after-read hazard occurs when a write-after-read sequence for a given
 lane in a vector ends up being executed as a read-after-write sequence due to
@@ -24149,8 +24148,7 @@ The intrinsic returns ``poison`` if the distance between ``%prtA`` and ``%ptrB``
 is smaller than ``VF * %elementsize`` and either ``%ptrA + VF * %elementSize``
 or ``%ptrB + VF * %elementSize`` wrap.
 The element of the result mask is active when loading from %ptrA then storing to
-%ptrB is safe and doesn't result in a write-after-read sequence turning into a
-read-after-write sequence, meaning that:
+%ptrB is safe and doesn't result in a write-after-read hazard:

 * (ptrB - ptrA) <= 0 (guarantees that all lanes are loaded before any stores), or
 * (ptrB - ptrA) >= elementSize * lane (guarantees that this lane is loaded
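As a sanity check on the two bullet conditions above, here is a small reference model. This is a hedged sketch of the documented lane conditions only: the function name and plain-integer addresses are illustrative, and the ``poison``/pointer-wrap cases from the paragraph above are deliberately ignored. It is not the LLVM implementation.

```python
def loop_dependence_war_mask(ptr_a: int, ptr_b: int, element_size: int, vf: int) -> list[bool]:
    """Model of the documented lane conditions for llvm.loop.dependence.war.mask.

    A lane is active when (ptrB - ptrA) <= 0 (all lanes are loaded before any
    stores) or when (ptrB - ptrA) >= elementSize * lane (this lane is loaded
    before the store to the same address can overwrite it).
    """
    diff = ptr_b - ptr_a
    return [diff <= 0 or diff >= element_size * lane for lane in range(vf)]

# ptrB 4 bytes past ptrA with 2-byte elements: lanes 0..2 are safe, lane 3 is not.
print(loop_dependence_war_mask(0, 4, element_size=2, vf=4))  # [True, True, True, False]

# ptrB at or before ptrA: every lane is safe.
print(loop_dependence_war_mask(8, 0, element_size=2, vf=4))  # [True, True, True, True]
```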
@@ -24188,13 +24186,19 @@ Overview:

 Given a vector store to %ptrA followed by a vector load from %ptrB, this
 instruction generates a mask where an active lane indicates that the
-read-after-write sequence can be performed safely for that lane, without the
-danger of it turning into a write-after-read sequence.
+read-after-write sequence can be performed safely for that lane, without a
+read-after-write hazard occurring or a new store-to-load forwarding hazard
+being introduced.
+
+A read-after-write hazard occurs when a read-after-write sequence for a given
+lane in a vector ends up being executed as a write-after-read sequence due to
+the aliasing of pointers.
+
+A store-to-load forwarding hazard occurs when a vector store writes to an
+address that partially overlaps with the address of a subsequent vector load.
+Only the overlapping addresses can be forwarded to the load if the data hasn't
+been written to memory yet.

 Arguments:
 """"""""""

Review discussion on the store-to-load forwarding wording:

  Reviewer: store-to-load forwarding hazard is not defined. Do we need this
  wording here?
  Author: Removed.
  Reviewer: The wording for the store-to-load forwarding (hazard) behaviour
  cannot be removed, because it is the only distinction between this intrinsic
  and the ``war.mask`` intrinsic.
  Author: I've re-added the hazard wording, thanks.

  Reviewer: The issue is that the load can't be performed until the write has
  completed, resulting in a stall that did not exist when executing as scalars.
  Author: Cheers.
@@ -24212,8 +24216,8 @@ The element of the result mask is active when storing to %ptrA then loading from
 %ptrB is safe and doesn't result in aliasing, meaning that:

 * abs(ptrB - ptrA) >= elementSize * lane (guarantees that the store of this lane
-  occurs before loading from this address)
-* ptrA == ptrB doesn't introduce any new hazards and is safe
+  occurs before loading from this address), or
+* ptrA == ptrB, doesn't introduce any new hazards

 Examples:
 """""""""
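The two bullet conditions above for the ``raw.mask`` intrinsic can also be modelled directly. As before, this is a hedged sketch of the documented semantics only (illustrative names, poison/wrap cases ignored), not LLVM's lowering.

```python
def loop_dependence_raw_mask(ptr_a: int, ptr_b: int, element_size: int, vf: int) -> list[bool]:
    """Model of the documented lane conditions for llvm.loop.dependence.raw.mask.

    A lane is active when abs(ptrB - ptrA) >= elementSize * lane (the store of
    this lane occurs before loading from this address), or when ptrA == ptrB
    (identical pointers introduce no new hazards).
    """
    if ptr_a == ptr_b:
        return [True] * vf
    diff = abs(ptr_b - ptr_a)
    return [diff >= element_size * lane for lane in range(vf)]

# Pointers 3 bytes apart with 2-byte elements: only lanes 0 and 1 are safe.
print(loop_dependence_raw_mask(0, 3, element_size=2, vf=4))  # [True, True, False, False]

# Identical pointers: all lanes are safe.
print(loop_dependence_raw_mask(16, 16, element_size=2, vf=4))  # [True, True, True, True]
```

Note the use of ``abs``: unlike the ``war.mask`` case, the distance check here is symmetric in the two pointers, matching the first bullet above.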
@@ -784,3 +784,115 @@ entry:
   %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 3)
   ret <16 x i1> %0
 }

; Tests for scalarisation of the loop dependence masks to a single lane.

define <1 x i1> @whilewr_8_scalarize(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_8_scalarize:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: subs x8, x1, x0
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cset w8, gt
; CHECK-NEXT: cmp x1, x0
; CHECK-NEXT: csinc w0, w8, wzr, ne
; CHECK-NEXT: ret
entry:
  %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 1)
  ret <1 x i1> %0
}
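For reference, here is my reading of the CHECK sequence above as a small Python model. This is an unofficial sketch (64-bit wraparound and NZCV flag details are elided): `subs`/`cmp #0`/`cset gt` computes `(b - a) > 0`, and `csinc ..., wzr, ne` selects that result when `b != a`, otherwise returns `wzr + 1 == 1`.

```python
def whilewr_8_scalarize_model(a: int, b: int) -> bool:
    """Sketch of the scalarised single-lane war.mask lowering above."""
    # subs x8, x1, x0            -> diff = b - a (signed; 64-bit wrap ignored)
    diff = b - a
    # cmp x8, #0 ; cset w8, gt   -> gt = diff > 0
    gt = diff > 0
    # cmp x1, x0 ; csinc w0, w8, wzr, ne
    #                            -> result = gt if b != a, else wzr + 1 == 1
    return gt if b != a else True
```

For the 8-bit (1-byte element) case this collapses to `(b - a) >= 0`; the wider variants below compare `diff` against `elementSize - 1` instead of 0.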

define <1 x i1> @whilewr_16_scalarize(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_16_scalarize:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: subs x8, x1, x0
; CHECK-NEXT: cmp x8, #1
; CHECK-NEXT: cset w8, gt
; CHECK-NEXT: cmp x1, x0
; CHECK-NEXT: csinc w0, w8, wzr, ne
; CHECK-NEXT: ret
entry:
  %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 2)
  ret <1 x i1> %0
}

define <1 x i1> @whilewr_32_scalarize(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_32_scalarize:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: subs x8, x1, x0
; CHECK-NEXT: cmp x8, #3
; CHECK-NEXT: cset w8, gt
; CHECK-NEXT: cmp x1, x0
; CHECK-NEXT: csinc w0, w8, wzr, ne
; CHECK-NEXT: ret
entry:
  %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 4)
  ret <1 x i1> %0
}

define <1 x i1> @whilewr_64_scalarize(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_64_scalarize:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: subs x8, x1, x0
; CHECK-NEXT: cmp x8, #7
; CHECK-NEXT: cset w8, gt
; CHECK-NEXT: cmp x1, x0
; CHECK-NEXT: csinc w0, w8, wzr, ne
; CHECK-NEXT: ret
entry:
  %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 8)
  ret <1 x i1> %0
}

define <1 x i1> @whilerw_8_scalarize(ptr %a, ptr %b) {
; CHECK-LABEL: whilerw_8_scalarize:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: subs x8, x1, x0
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cset w8, gt
; CHECK-NEXT: cmp x1, x0
; CHECK-NEXT: csinc w0, w8, wzr, ne
; CHECK-NEXT: ret
entry:
  %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 1)
  ret <1 x i1> %0
}

define <1 x i1> @whilerw_16_scalarize(ptr %a, ptr %b) {
; CHECK-LABEL: whilerw_16_scalarize:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: subs x8, x1, x0
; CHECK-NEXT: cmp x8, #1
; CHECK-NEXT: cset w8, gt
; CHECK-NEXT: cmp x1, x0
; CHECK-NEXT: csinc w0, w8, wzr, ne
; CHECK-NEXT: ret
entry:
  %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 2)
  ret <1 x i1> %0
}

define <1 x i1> @whilerw_32_scalarize(ptr %a, ptr %b) {
; CHECK-LABEL: whilerw_32_scalarize:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: subs x8, x1, x0
; CHECK-NEXT: cmp x8, #3
; CHECK-NEXT: cset w8, gt
; CHECK-NEXT: cmp x1, x0
; CHECK-NEXT: csinc w0, w8, wzr, ne
; CHECK-NEXT: ret
entry:
  %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 4)
  ret <1 x i1> %0
}

define <1 x i1> @whilerw_64_scalarize(ptr %a, ptr %b) {
; CHECK-LABEL: whilerw_64_scalarize:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: subs x8, x1, x0
; CHECK-NEXT: cmp x8, #7
; CHECK-NEXT: cset w8, gt
; CHECK-NEXT: cmp x1, x0
; CHECK-NEXT: csinc w0, w8, wzr, ne
; CHECK-NEXT: ret
entry:
  %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 8)
  ret <1 x i1> %0
}