From c943b046e5eeb5faae74783b80137593a43760e6 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs@arm.com>
Date: Wed, 26 Jun 2024 09:55:45 +0100
Subject: [PATCH 01/19] [AArch64] Lower alias mask to a whilewr

https://github.com/llvm/llvm-project/pull/100579 emits IR that creates a
mask disabling lanes that could alias within a loop iteration, based on
a pair of pointers. This PR lowers that IR to a WHILEWR instruction for
AArch64.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  82 ++
 .../LoopVectorize/AArch64/alias_mask.ll       | 884 ++++++++++++++++++
 2 files changed, 966 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d86e52d49000a..c2e9ba6291855 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -94,6 +94,7 @@
 #include <bitset>
 #include <cassert>
 #include <cctype>
+#include <cmath>
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
@@ -1523,6 +1524,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
       setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
       setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+      setOperationAction(ISD::OR, VT, Custom);
       setOperationAction(ISD::SELECT_CC, VT, Expand);
       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
@@ -13782,8 +13784,88 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
   return ResultSLI;
 }
 
+/// Try to lower the construction of a pointer alias mask to a WHILEWR.
+/// The mask's enabled lanes represent the elements that will not overlap across one loop iteration.
+/// This tries to match:
+/// or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
+/// (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
+SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
+  if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE2())
+    return SDValue();
+  auto LaneMask = Op.getOperand(0);
+  auto Splat = Op.getOperand(1);
+
+  if (LaneMask.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
+      LaneMask.getConstantOperandVal(0) != Intrinsic::get_active_lane_mask ||
+      Splat.getOpcode() != ISD::SPLAT_VECTOR)
+    return SDValue();
+
+  auto Cmp = Splat.getOperand(0);
+  if (Cmp.getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  CondCodeSDNode *Cond = dyn_cast<CondCodeSDNode>(Cmp.getOperand(2));
+  assert(Cond && "SETCC doesn't have a condition code");
+
+  auto ComparatorConst = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
+  if (!ComparatorConst || ComparatorConst->getSExtValue() > 0 ||
+      Cond->get() != ISD::CondCode::SETLT)
+    return SDValue();
+  unsigned CompValue = std::abs(ComparatorConst->getSExtValue());
+  unsigned EltSize = CompValue + 1;
+  if (!isPowerOf2_64(EltSize) || EltSize > 64)
+    return SDValue();
+
+  auto Diff = Cmp.getOperand(0);
+  if (Diff.getOpcode() != ISD::SUB || Diff.getValueType() != MVT::i64)
+    return SDValue();
+
+  auto LaneMaskConst = dyn_cast<ConstantSDNode>(LaneMask.getOperand(1));
+  if (!LaneMaskConst || LaneMaskConst->getZExtValue() != 0 ||
+      (EltSize != 1 && LaneMask.getOperand(2).getOpcode() != ISD::SRA))
+    return SDValue();
+
+  // An alias mask for i8 elements omits the division because it would just divide by 1
+  if (EltSize > 1) {
+    auto DiffDiv = LaneMask.getOperand(2);
+    auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
+    if (!DiffDivConst || DiffDivConst->getZExtValue() != std::log2(EltSize))
+      return SDValue();
+  } else if (LaneMask.getOperand(2) != Diff)
+    return SDValue();
+
+  auto StorePtr = Diff.getOperand(0);
+  auto ReadPtr = Diff.getOperand(1);
+
+  unsigned 
IntrinsicID = 0; + switch (EltSize) { + case 1: + IntrinsicID = Intrinsic::aarch64_sve_whilewr_b; + break; + case 2: + IntrinsicID = Intrinsic::aarch64_sve_whilewr_h; + break; + case 4: + IntrinsicID = Intrinsic::aarch64_sve_whilewr_s; + break; + case 8: + IntrinsicID = Intrinsic::aarch64_sve_whilewr_d; + break; + default: + return SDValue(); + } + SDLoc DL(Op); + SDValue ID = DAG.getConstant(IntrinsicID, DL, MVT::i32); + auto N = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), ID, + StorePtr, ReadPtr); + return N; +} + SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { + + if (SDValue SV = tryWhileWRFromOR(Op, DAG)) + return SV; if (useSVEForFixedLengthVectorVT(Op.getValueType(), !Subtarget->isNeonAvailable())) return LowerToScalableOp(Op, DAG); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll new file mode 100644 index 0000000000000..3662efa41c151 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll @@ -0,0 +1,884 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve2 -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2 +define dso_local void @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB0_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: whilewr p0.b, x1, x2 +; CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NEXT: cntp x10, p0, p0.b +; CHECK-NEXT: and x10, x10, #0xff +; CHECK-NEXT: .LBB0_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8] +; CHECK-NEXT: add z0.b, z1.b, z0.b +; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8] +; CHECK-NEXT: add x8, x8, x10 +; CHECK-NEXT: whilelo p1.b, x8, x9 +; CHECK-NEXT: b.mi .LBB0_2 +; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_8: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB0_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: sub x9, x1, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9 +; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b +; CHECK-NOSVE2-NEXT: and x10, x10, #0xff +; CHECK-NOSVE2-NEXT: .LBB0_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8] +; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8] +; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b +; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB0_2 +; CHECK-NOSVE2-NEXT: .LBB0_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label 
%for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %c14 = ptrtoint ptr %c to i64 + %b15 = ptrtoint ptr %b to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %sub.diff = sub i64 %b15, %c14 + %neg.compare = icmp slt i64 %sub.diff, 0 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count) + %0 = zext %active.lane.mask.alias to + %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8( %0) + %2 = zext i8 %1 to i64 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %3 = and %active.lane.mask, %active.lane.mask.alias + %4 = getelementptr inbounds i8, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, %3, poison) + %5 = getelementptr inbounds i8, ptr %b, i64 %index + %wide.masked.load16 = tail call @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, %3, poison) + %6 = add %wide.masked.load16, %wide.masked.load + %7 = getelementptr inbounds i8, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv16i8.p0( %6, ptr %7, i32 1, %3) + %index.next = add i64 %index, %2 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count) + %8 = extractelement %active.lane.mask.next, i64 0 + br i1 %8, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB1_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: mov w8, w3 +; CHECK-NEXT: whilewr p1.h, x1, x2 +; CHECK-NEXT: mov x9, xzr +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: .LBB1_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1] +; CHECK-NEXT: add z0.h, z1.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x2, x9, lsl #1] +; CHECK-NEXT: inch x9 +; CHECK-NEXT: whilelo p0.h, x9, x8 +; CHECK-NEXT: b.mi .LBB1_2 +; CHECK-NEXT: .LBB1_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_16: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB1_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sub x10, x1, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9 +; CHECK-NOSVE2-NEXT: cmn x10, #1 +; CHECK-NOSVE2-NEXT: add x10, x10, x10, lsr #63 +; CHECK-NOSVE2-NEXT: cset w11, lt +; CHECK-NOSVE2-NEXT: sbfx x11, x11, #0, #1 +; CHECK-NOSVE2-NEXT: asr x10, x10, #1 +; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x11 +; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10 +; CHECK-NOSVE2-NEXT: cnth x10 +; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: .LBB1_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p0/z, 
[x0, x8, lsl #1] +; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h +; CHECK-NOSVE2-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p0.h, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB1_2 +; CHECK-NOSVE2-NEXT: .LBB1_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %b14 = ptrtoint ptr %b to i64 + %c15 = ptrtoint ptr %c to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 3 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count) + %sub.diff = sub i64 %b14, %c15 + %diff = sdiv i64 %sub.diff, 2 + %neg.compare = icmp slt i64 %sub.diff, -1 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %2 = and %active.lane.mask.alias, %active.lane.mask.entry + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %3 = getelementptr inbounds i16, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, %active.lane.mask, poison) + %4 = getelementptr inbounds i16, ptr %b, i64 %index + %wide.masked.load16 = tail call @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, %active.lane.mask, poison) + %5 = add %wide.masked.load16, %wide.masked.load + %6 = getelementptr inbounds i16, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv8i16.p0( %5, ptr %6, i32 2, %active.lane.mask) + %index.next = add i64 %index, %1 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count) + %7 = extractelement %active.lane.mask.next, i64 0 + br i1 %7, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB2_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: mov w8, w3 +; CHECK-NEXT: whilewr p1.s, x1, x2 +; CHECK-NEXT: mov x9, xzr +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: .LBB2_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x2, x9, lsl #2] +; CHECK-NEXT: incw x9 +; CHECK-NEXT: whilelo p0.s, x9, x8 +; CHECK-NEXT: b.mi .LBB2_2 +; CHECK-NEXT: .LBB2_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_32: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB2_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sub x10, x1, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9 +; CHECK-NOSVE2-NEXT: add x11, x10, #3 +; CHECK-NOSVE2-NEXT: cmp x10, #0 +; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt +; 
CHECK-NOSVE2-NEXT: cmn x10, #3 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: asr x11, x11, #2 +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x11 +; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10 +; CHECK-NOSVE2-NEXT: cntw x10 +; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: .LBB2_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s +; CHECK-NOSVE2-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p0.s, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB2_2 +; CHECK-NOSVE2-NEXT: .LBB2_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %b12 = ptrtoint ptr %b to i64 + %c13 = ptrtoint ptr %c to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 2 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count) + %sub.diff = sub i64 %b12, %c13 + %diff = sdiv i64 %sub.diff, 4 + %neg.compare = icmp slt i64 %sub.diff, -3 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %2 = and %active.lane.mask.alias, %active.lane.mask.entry + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %3 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, %active.lane.mask, poison) + %4 = getelementptr inbounds i32, ptr %b, i64 %index + %wide.masked.load14 = tail call @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, %active.lane.mask, poison) + %5 = add %wide.masked.load14, %wide.masked.load + %6 = getelementptr inbounds i32, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv4i32.p0( %5, ptr %6, i32 4, %active.lane.mask) + %index.next = add i64 %index, %1 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count) + %7 = extractelement %active.lane.mask.next, i64 0 + br i1 %7, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB3_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: mov w8, w3 +; CHECK-NEXT: whilewr p1.d, x1, x2 +; CHECK-NEXT: mov x9, xzr +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: .LBB3_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3] +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: st1d { z0.d }, p0, [x2, x9, lsl #3] +; CHECK-NEXT: incd x9 +; CHECK-NEXT: whilelo p0.d, x9, x8 +; CHECK-NEXT: b.mi .LBB3_2 +; CHECK-NEXT: .LBB3_3: 
// %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_64: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB3_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sub x10, x1, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9 +; CHECK-NOSVE2-NEXT: add x11, x10, #7 +; CHECK-NOSVE2-NEXT: cmp x10, #0 +; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt +; CHECK-NOSVE2-NEXT: cmn x10, #7 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: asr x11, x11, #3 +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x11 +; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10 +; CHECK-NOSVE2-NEXT: cntd x10 +; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: .LBB3_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d +; CHECK-NOSVE2-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p0.d, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB3_2 +; CHECK-NOSVE2-NEXT: .LBB3_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %b12 = ptrtoint ptr %b to i64 + %c13 = ptrtoint ptr %c to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 1 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count) + %sub.diff = sub i64 %b12, %c13 + %diff = sdiv i64 %sub.diff, 8 + %neg.compare = icmp slt i64 %sub.diff, -7 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %2 = and %active.lane.mask.alias, %active.lane.mask.entry + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %3 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, %active.lane.mask, poison) + %4 = getelementptr inbounds i64, ptr %b, i64 %index + %wide.masked.load14 = tail call @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, %active.lane.mask, poison) + %5 = add %wide.masked.load14, %wide.masked.load + %6 = getelementptr inbounds i64, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv2i64.p0( %5, ptr %6, i32 8, %active.lane.mask) + %index.next = add i64 %index, %1 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count) + %7 = extractelement %active.lane.mask.next, i64 0 + br i1 %7, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_multiple_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB4_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: whilewr p0.b, x0, x2 +; 
CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: whilewr p1.b, x1, x2 +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NEXT: cntp x10, p0, p0.b +; CHECK-NEXT: and x10, x10, #0xff +; CHECK-NEXT: .LBB4_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8] +; CHECK-NEXT: add z0.b, z1.b, z0.b +; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8] +; CHECK-NEXT: add x8, x8, x10 +; CHECK-NEXT: whilelo p1.b, x8, x9 +; CHECK-NEXT: b.mi .LBB4_2 +; CHECK-NEXT: .LBB4_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_multiple_8: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB4_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: sub x9, x0, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9 +; CHECK-NOSVE2-NEXT: sub x9, x1, x2 +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x10 +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: whilelo p3.b, xzr, x9 +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: whilelo p2.b, xzr, x10 +; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b +; CHECK-NOSVE2-NEXT: and x10, x10, #0xff +; CHECK-NOSVE2-NEXT: .LBB4_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8] +; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8] +; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b +; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB4_2 +; CHECK-NOSVE2-NEXT: .LBB4_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %c14 = ptrtoint ptr %c to i64 + %a15 = ptrtoint ptr %a to i64 + %b16 = ptrtoint ptr %b to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %sub.diff = sub i64 %a15, %c14 + %neg.compare = icmp slt i64 %sub.diff, 0 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %sub.diff18 = sub i64 %b16, %c14 + %neg.compare20 = icmp slt i64 %sub.diff18, 0 + %.splatinsert21 = insertelement poison, i1 %neg.compare20, i64 0 + %.splat22 = shufflevector %.splatinsert21, poison, zeroinitializer + %ptr.diff.lane.mask23 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff18) + %active.lane.mask.alias24 = or %ptr.diff.lane.mask23, %.splat22 + %0 = and %active.lane.mask.alias, %active.lane.mask.alias24 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count) + %1 = zext %0 to + %2 = tail call i8 @llvm.vector.reduce.add.nxv16i8( %1) + %3 = zext i8 %2 to i64 + br label 
%vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %4 = and %active.lane.mask, %0 + %5 = getelementptr inbounds i8, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, %4, poison) + %6 = getelementptr inbounds i8, ptr %b, i64 %index + %wide.masked.load25 = tail call @llvm.masked.load.nxv16i8.p0(ptr %6, i32 1, %4, poison) + %7 = add %wide.masked.load25, %wide.masked.load + %8 = getelementptr inbounds i8, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv16i8.p0( %7, ptr %8, i32 1, %4) + %index.next = add i64 %index, %3 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count) + %9 = extractelement %active.lane.mask.next, i64 0 + br i1 %9, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_multiple_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB5_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: whilewr p0.h, x0, x2 +; CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: whilewr p1.h, x1, x2 +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NEXT: whilelo p1.h, xzr, x9 +; CHECK-NEXT: cntp x10, p0, p0.h +; CHECK-NEXT: and x10, x10, #0xff +; CHECK-NEXT: .LBB5_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1] +; CHECK-NEXT: add z0.h, z1.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1] +; CHECK-NEXT: add x8, x8, x10 +; CHECK-NEXT: whilelo p1.h, x8, x9 +; CHECK-NEXT: b.mi .LBB5_2 +; CHECK-NEXT: .LBB5_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_multiple_16: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB5_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: sub x9, x0, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: cmn x9, #1 +; CHECK-NOSVE2-NEXT: add x9, x9, x9, lsr #63 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: asr x9, x9, #1 +; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x10 +; CHECK-NOSVE2-NEXT: sub x10, x1, x2 +; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9 +; CHECK-NOSVE2-NEXT: add x9, x10, x10, lsr #63 +; CHECK-NOSVE2-NEXT: cmn x10, #1 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: asr x9, x9, #1 +; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p3.h, xzr, x9 +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10 +; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9 +; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.h +; CHECK-NOSVE2-NEXT: and x10, x10, #0xff +; CHECK-NOSVE2-NEXT: .LBB5_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1] +; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h +; CHECK-NOSVE2-NEXT: st1h { z0.h 
}, p1, [x2, x8, lsl #1] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p1.h, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB5_2 +; CHECK-NOSVE2-NEXT: .LBB5_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %c14 = ptrtoint ptr %c to i64 + %a15 = ptrtoint ptr %a to i64 + %b16 = ptrtoint ptr %b to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %sub.diff = sub i64 %a15, %c14 + %diff = sdiv i64 %sub.diff, 2 + %neg.compare = icmp slt i64 %sub.diff, -1 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %sub.diff18 = sub i64 %b16, %c14 + %diff19 = sdiv i64 %sub.diff18, 2 + %neg.compare20 = icmp slt i64 %sub.diff18, -1 + %.splatinsert21 = insertelement poison, i1 %neg.compare20, i64 0 + %.splat22 = shufflevector %.splatinsert21, poison, zeroinitializer + %ptr.diff.lane.mask23 = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff19) + %active.lane.mask.alias24 = or %ptr.diff.lane.mask23, %.splat22 + %0 = and %active.lane.mask.alias, %active.lane.mask.alias24 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count) + %1 = zext %0 to + %2 = tail call i8 @llvm.vector.reduce.add.nxv8i8( %1) + %3 = zext i8 %2 to i64 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %4 = and %active.lane.mask, %0 + %5 = getelementptr inbounds i16, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv8i16.p0(ptr %5, i32 2, %4, poison) + %6 = getelementptr inbounds i16, ptr %b, i64 %index + %wide.masked.load25 = tail call @llvm.masked.load.nxv8i16.p0(ptr %6, i32 2, %4, poison) + %7 = add %wide.masked.load25, %wide.masked.load + %8 = getelementptr inbounds i16, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv8i16.p0( %7, ptr %8, i32 2, %4) + %index.next = add i64 %index, %3 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count) + %9 = extractelement %active.lane.mask.next, i64 0 + br i1 %9, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_multiple_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB6_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: whilewr p0.s, x0, x2 +; CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: whilewr p1.s, x1, x2 +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NEXT: whilelo p1.s, xzr, x9 +; CHECK-NEXT: cntp x10, p0, p0.s +; CHECK-NEXT: and x10, x10, #0xff +; CHECK-NEXT: .LBB6_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2] +; CHECK-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2] +; CHECK-NEXT: add x8, x8, x10 +; CHECK-NEXT: whilelo p1.s, x8, x9 +; CHECK-NEXT: b.mi .LBB6_2 +; CHECK-NEXT: .LBB6_3: // 
%for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_multiple_32: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB6_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: sub x9, x0, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: add x10, x9, #3 +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt +; CHECK-NOSVE2-NEXT: cmn x9, #3 +; CHECK-NOSVE2-NEXT: asr x9, x10, #2 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9 +; CHECK-NOSVE2-NEXT: sub x9, x1, x2 +; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10 +; CHECK-NOSVE2-NEXT: add x10, x9, #3 +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt +; CHECK-NOSVE2-NEXT: cmn x9, #3 +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: asr x10, x10, #2 +; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p3.s, xzr, x10 +; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x9 +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9 +; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.s +; CHECK-NOSVE2-NEXT: and x10, x10, #0xff +; CHECK-NOSVE2-NEXT: .LBB6_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] +; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2] +; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s +; CHECK-NOSVE2-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p1.s, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB6_2 +; CHECK-NOSVE2-NEXT: .LBB6_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %c12 = ptrtoint ptr %c to i64 + %a13 = ptrtoint ptr %a to i64 + %b14 = ptrtoint ptr %b to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %sub.diff = sub i64 %a13, %c12 + %diff = sdiv i64 %sub.diff, 4 + %neg.compare = icmp slt i64 %sub.diff, -3 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %sub.diff16 = sub i64 %b14, %c12 + %diff17 = sdiv i64 %sub.diff16, 4 + %neg.compare18 = icmp slt i64 %sub.diff16, -3 + %.splatinsert19 = insertelement poison, i1 %neg.compare18, i64 0 + %.splat20 = shufflevector %.splatinsert19, poison, zeroinitializer + %ptr.diff.lane.mask21 = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff17) + %active.lane.mask.alias22 = or %ptr.diff.lane.mask21, %.splat20 + %0 = and %active.lane.mask.alias, %active.lane.mask.alias22 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count) + %1 = zext %0 to + %2 = tail call i8 @llvm.vector.reduce.add.nxv4i8( %1) + %3 = zext i8 %2 to i64 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %4 = and %active.lane.mask, %0 + %5 = getelementptr 
inbounds i32, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv4i32.p0(ptr %5, i32 4, %4, poison) + %6 = getelementptr inbounds i32, ptr %b, i64 %index + %wide.masked.load23 = tail call @llvm.masked.load.nxv4i32.p0(ptr %6, i32 4, %4, poison) + %7 = add %wide.masked.load23, %wide.masked.load + %8 = getelementptr inbounds i32, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv4i32.p0( %7, ptr %8, i32 4, %4) + %index.next = add i64 %index, %3 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count) + %9 = extractelement %active.lane.mask.next, i64 0 + br i1 %9, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_multiple_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB7_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: whilewr p0.d, x0, x2 +; CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: whilewr p1.d, x1, x2 +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NEXT: whilelo p1.d, xzr, x9 +; CHECK-NEXT: cntp x10, p0, p0.d +; CHECK-NEXT: and x10, x10, #0xff +; CHECK-NEXT: .LBB7_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3] +; CHECK-NEXT: add x8, x8, x10 +; CHECK-NEXT: whilelo p1.d, x8, x9 +; CHECK-NEXT: b.mi .LBB7_2 +; CHECK-NEXT: .LBB7_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_multiple_64: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB7_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: sub x9, x0, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: add x10, x9, #7 +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt +; CHECK-NOSVE2-NEXT: cmn x9, #7 +; CHECK-NOSVE2-NEXT: asr x9, x10, #3 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9 +; CHECK-NOSVE2-NEXT: sub x9, x1, x2 +; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10 +; CHECK-NOSVE2-NEXT: add x10, x9, #7 +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt +; CHECK-NOSVE2-NEXT: cmn x9, #7 +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: asr x10, x10, #3 +; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p3.d, xzr, x10 +; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x9 +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9 +; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.d +; CHECK-NOSVE2-NEXT: and x10, x10, #0xff +; CHECK-NOSVE2-NEXT: .LBB7_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] +; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d +; CHECK-NOSVE2-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p1.d, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi 
.LBB7_2
+; CHECK-NOSVE2-NEXT: .LBB7_3: // %for.cond.cleanup
+; CHECK-NOSVE2-NEXT: ret
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %c12 = ptrtoint ptr %c to i64
+  %a13 = ptrtoint ptr %a to i64
+  %b14 = ptrtoint ptr %b to i64
+  %wide.trip.count = zext nneg i32 %n to i64
+  %sub.diff = sub i64 %a13, %c12
+  %diff = sdiv i64 %sub.diff, 8
+  %neg.compare = icmp slt i64 %sub.diff, -7
+  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
+  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
+  %sub.diff16 = sub i64 %b14, %c12
+  %diff17 = sdiv i64 %sub.diff16, 8
+  %neg.compare18 = icmp slt i64 %sub.diff16, -7
+  %.splatinsert19 = insertelement <vscale x 2 x i1> poison, i1 %neg.compare18, i64 0
+  %.splat20 = shufflevector <vscale x 2 x i1> %.splatinsert19, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %ptr.diff.lane.mask21 = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff17)
+  %active.lane.mask.alias22 = or <vscale x 2 x i1> %ptr.diff.lane.mask21, %.splat20
+  %0 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.alias22
+  %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
+  %1 = zext <vscale x 2 x i1> %0 to <vscale x 2 x i8>
+  %2 = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %1)
+  %3 = zext i8 %2 to i64
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %4 = and <vscale x 2 x i1> %active.lane.mask, %0
+  %5 = getelementptr inbounds i64, ptr %a, i64 %index
+  %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %5, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
+  %6 = getelementptr inbounds i64, ptr %b, i64 %index
+  %wide.masked.load23 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %6, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
+  %7 = add <vscale x 2 x i64> %wide.masked.load23, %wide.masked.load
+  %8 = getelementptr inbounds i64, ptr %c, i64 %index
+  tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %7, ptr %8, i32 8, <vscale x 2 x i1> %4)
+  %index.next = add i64 %index, %3
+  %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %9 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
+  br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64() #1
+
+declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64) #1
+
+declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>) #2
+
+declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>) #3
+
+declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64) #1
+
+declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nocapture, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i16>) #2
+
+declare void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16>, ptr nocapture, i32 immarg, <vscale x 8 x i1>) #3
+
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64) #1
+
+declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>) #2
+
+declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, i32 immarg, <vscale x 4 x i1>) #3
+
+declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64) #1
+
+declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i64>) #2
+
+declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr nocapture, i32 immarg, <vscale x 2 x i1>) #3
+
+attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+outline-atomics,+pauth,+ras,+rcpc,+rdm,+sme,+sme2,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
+attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }

From 54129dcb824821e8575e2b9e8005782259439dc4 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs@arm.com>
Date: Mon, 29 Jul 2024 17:12:20 +0100
Subject: [PATCH 02/19] Add codegen test

---
 llvm/test/CodeGen/AArch64/whilewr.ll | 127 +++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/whilewr.ll

diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
new file mode 100644
index 0000000000000..84855e3898360
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve2 -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
+define dso_local <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilewr p0.b, x1, x2
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_8:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x8
+; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+  %c14 = ptrtoint ptr %c to i64
+  %b15 = ptrtoint ptr %b to i64
+  %sub.diff = sub i64 %b15, %c14
+  %neg.compare = icmp slt i64 %sub.diff, 0
+  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
+  %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
+  ret <vscale x 16 x i1> %active.lane.mask.alias
+}
+
+define dso_local <vscale x 8 x i1> @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilewr p0.h, x1, x2
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_16:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: cmn x8, #1
+; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
+; CHECK-NOSVE2-NEXT: asr x8, x8, #1
+; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9
+; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+  %b14 = ptrtoint ptr %b to i64
+  %c15 = ptrtoint ptr %c to i64
+  %sub.diff = sub i64 %b14, %c15
+  %diff = sdiv i64 %sub.diff, 2
+  %neg.compare = icmp slt i64 %sub.diff, -1
+  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
+  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
+  ret <vscale x 8 x i1> %active.lane.mask.alias
+}
+
+define dso_local <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 
%n) {
+; CHECK-LABEL: whilewr_32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilewr p0.s, x1, x2
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_32:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: add x9, x8, #3
+; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
+; CHECK-NOSVE2-NEXT: cmn x8, #3
+; CHECK-NOSVE2-NEXT: cset w8, lt
+; CHECK-NOSVE2-NEXT: asr x9, x9, #2
+; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9
+; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+  %b12 = ptrtoint ptr %b to i64
+  %c13 = ptrtoint ptr %c to i64
+  %sub.diff = sub i64 %b12, %c13
+  %diff = sdiv i64 %sub.diff, 4
+  %neg.compare = icmp slt i64 %sub.diff, -3
+  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
+  %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
+  ret <vscale x 4 x i1> %active.lane.mask.alias
+}
+
+define dso_local <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilewr p0.d, x1, x2
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_64:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: add x9, x8, #7
+; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
+; CHECK-NOSVE2-NEXT: cmn x8, #7
+; CHECK-NOSVE2-NEXT: cset w8, lt
+; CHECK-NOSVE2-NEXT: asr x9, x9, #3
+; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9
+; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+  %b12 = ptrtoint ptr %b to i64
+  %c13 = ptrtoint ptr %c to i64
+  %sub.diff = sub i64 %b12, %c13
+  %diff = sdiv i64 %sub.diff, 8
+  %neg.compare = icmp slt i64 %sub.diff, -7
+  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
+  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
+  ret <vscale x 2 x i1> %active.lane.mask.alias
+}

From a667dcde8ff60f3d1fb8dc47e9ebdc2f3fea8f82 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs@arm.com>
Date: Mon, 29 Jul 2024 17:13:01 +0100
Subject: [PATCH 03/19] format

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c2e9ba6291855..2a4daf779f1e1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13785,9 +13785,9 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
 }
 
 /// Try to lower the construction of a pointer alias mask to a WHILEWR.
-/// The mask's enabled lanes represent the elements that will not overlap across one loop iteration.
-/// This tries to match:
-/// or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
+/// The mask's enabled lanes represent the elements that will not overlap across
+/// one loop iteration. 
This tries to match: or (splat (setcc_lt (sub ptrA,
+/// ptrB), -(element_size - 1))),
 /// (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
 SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
   if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE2())
@@ -13825,7 +13825,8 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
       (EltSize != 1 && LaneMask.getOperand(2).getOpcode() != ISD::SRA))
     return SDValue();
 
-  // An alias mask for i8 elements omits the division because it would just divide by 1
+  // An alias mask for i8 elements omits the division because it would just
+  // divide by 1
   if (EltSize > 1) {
     auto DiffDiv = LaneMask.getOperand(2);
     auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));

From 9899a41fb85c918b81d8b931527cf61ea5292fdf Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs@arm.com>
Date: Tue, 30 Jul 2024 13:48:49 +0100
Subject: [PATCH 04/19] Use Log2_64

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2a4daf779f1e1..b1faf021de95d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -94,7 +94,6 @@
 #include <bitset>
 #include <cassert>
 #include <cctype>
-#include <cmath>
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
@@ -13830,7 +13829,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
   if (EltSize > 1) {
     auto DiffDiv = LaneMask.getOperand(2);
     auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
-    if (!DiffDivConst || DiffDivConst->getZExtValue() != std::log2(EltSize))
+    if (!DiffDivConst || DiffDivConst->getZExtValue() != Log2_64(EltSize))
       return SDValue();
   } else if (LaneMask.getOperand(2) != Diff)
     return SDValue();

From 5f739bdccfd6fc2577959678855c194fb9e8242f Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs@arm.com>
Date: Tue, 30 Jul 2024 13:49:49 +0100
Subject: [PATCH 05/19] Fix comment formatting

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b1faf021de95d..08776f26432d6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13785,8 +13785,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
 
 /// Try to lower the construction of a pointer alias mask to a WHILEWR.
 /// The mask's enabled lanes represent the elements that will not overlap across
-/// one loop iteration. This tries to match: or (splat (setcc_lt (sub ptrA,
-/// ptrB), -(element_size - 1))),
+/// one loop iteration. 
This tries to match:
+/// or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
 /// (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
 SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
   if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE2())

From e69f3be14a100f54032de69dd3dce615072dc6f2 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs@arm.com>
Date: Tue, 30 Jul 2024 13:51:11 +0100
Subject: [PATCH 06/19] Use SDValue instead of auto

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 08776f26432d6..dd9bc8151dacb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13791,15 +13791,15 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
 SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
   if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE2())
     return SDValue();
-  auto LaneMask = Op.getOperand(0);
-  auto Splat = Op.getOperand(1);
+  SDValue LaneMask = Op.getOperand(0);
+  SDValue Splat = Op.getOperand(1);
 
   if (LaneMask.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
       LaneMask.getConstantOperandVal(0) != Intrinsic::get_active_lane_mask ||
       Splat.getOpcode() != ISD::SPLAT_VECTOR)
     return SDValue();
 
-  auto Cmp = Splat.getOperand(0);
+  SDValue Cmp = Splat.getOperand(0);
   if (Cmp.getOpcode() != ISD::SETCC)
     return SDValue();
 
@@ -13815,7 +13815,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
   if (!isPowerOf2_64(EltSize) || EltSize > 64)
     return SDValue();
 
-  auto Diff = Cmp.getOperand(0);
+  SDValue Diff = Cmp.getOperand(0);
   if (Diff.getOpcode() != ISD::SUB || Diff.getValueType() != MVT::i64)
     return SDValue();
 
@@ -13827,15 +13827,15 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
   // An alias mask for i8 elements omits the division because it would just
   // divide by 1
   if (EltSize > 1) {
-    auto DiffDiv = LaneMask.getOperand(2);
+    SDValue DiffDiv = LaneMask.getOperand(2);
     auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
     if (!DiffDivConst || DiffDivConst->getZExtValue() != Log2_64(EltSize))
       return SDValue();
   } else if (LaneMask.getOperand(2) != Diff)
     return SDValue();
 
-  auto StorePtr = Diff.getOperand(0);
-  auto ReadPtr = Diff.getOperand(1);
+  SDValue StorePtr = Diff.getOperand(0);
+  SDValue ReadPtr = Diff.getOperand(1);
 
   unsigned IntrinsicID = 0;
   switch (EltSize) {
@@ -13856,9 +13856,8 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
   SDValue ID = DAG.getConstant(IntrinsicID, DL, MVT::i32);
-  auto N = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), ID,
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), ID,
                      StorePtr, ReadPtr);
-  return N;
 }

From 653d6d2d13b9b2f593199aa1086d6f19c667b4e0 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs@arm.com>
Date: Tue, 30 Jul 2024 13:55:15 +0100
Subject: [PATCH 07/19] Check for OR operands being the other way around

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  3 ++
 llvm/test/CodeGen/AArch64/whilewr.ll          | 28 +++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dd9bc8151dacb..a321a622b4ae4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13794,6 +13794,9 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
   SDValue LaneMask = 
Op.getOperand(0);
   SDValue Splat = Op.getOperand(1);
 
+  if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
+    std::swap(LaneMask, Splat);
+
   if (LaneMask.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
       LaneMask.getConstantOperandVal(0) != Intrinsic::get_active_lane_mask ||
       Splat.getOpcode() != ISD::SPLAT_VECTOR)
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index 84855e3898360..5b9a9775e6597 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -29,6 +29,34 @@ entry:
   ret <vscale x 16 x i1> %active.lane.mask.alias
 }
 
+define dso_local <vscale x 16 x i1> @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+; CHECK-LABEL: whilewr_commutative:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: whilewr p0.b, x1, x2
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2-LABEL: whilewr_commutative:
+; CHECK-NOSVE2: // %bb.0: // %entry
+; CHECK-NOSVE2-NEXT: sub x8, x1, x2
+; CHECK-NOSVE2-NEXT: cmp x8, #0
+; CHECK-NOSVE2-NEXT: cset w9, lt
+; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
+; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x8
+; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
+; CHECK-NOSVE2-NEXT: ret
+entry:
+  %c14 = ptrtoint ptr %c to i64
+  %b15 = ptrtoint ptr %b to i64
+  %sub.diff = sub i64 %b15, %c14
+  %neg.compare = icmp slt i64 %sub.diff, 0
+  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
+  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
+  %active.lane.mask.alias = or <vscale x 16 x i1> %.splat, %ptr.diff.lane.mask
+  ret <vscale x 16 x i1> %active.lane.mask.alias
+}
+
 define dso_local <vscale x 8 x i1> @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-LABEL: whilewr_16:
 ; CHECK: // %bb.0: // %entry

From 487ff3ca120420fd184dbb7e980332aaab302389 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs@arm.com>
Date: Tue, 30 Jul 2024 13:55:42 +0100
Subject: [PATCH 08/19] Replace dyn_cast and assert by cast

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a321a622b4ae4..42d529e9dfdbe 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13806,8 +13806,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
   if (Cmp.getOpcode() != ISD::SETCC)
     return SDValue();
 
-  CondCodeSDNode *Cond = dyn_cast<CondCodeSDNode>(Cmp.getOperand(2));
-  assert(Cond && "SETCC doesn't have a condition code");
+  CondCodeSDNode *Cond = cast<CondCodeSDNode>(Cmp.getOperand(2));
 
   auto ComparatorConst = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
   if (!ComparatorConst || ComparatorConst->getSExtValue() > 0 ||

From ab22dd1b375628abdd94d7de6a05a1b970a3ad6b Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs@arm.com>
Date: Tue, 30 Jul 2024 14:29:35 +0100
Subject: [PATCH 09/19] Fix eltsize comparison and add test for it

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  2 +-
 llvm/test/CodeGen/AArch64/whilewr.ll          | 53 +++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 42d529e9dfdbe..dedbea0c5d0b0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13814,7 +13814,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
     return SDValue();
   unsigned CompValue = std::abs(ComparatorConst->getSExtValue());
   unsigned EltSize = CompValue + 1;
-  if (!isPowerOf2_64(EltSize) || EltSize > 64)
+  if 
(!isPowerOf2_64(EltSize) || EltSize > 64) + if (!isPowerOf2_64(EltSize) || EltSize > 8) return SDValue(); SDValue Diff = Cmp.getOperand(0); diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll index 5b9a9775e6597..93ed825cf60f3 100644 --- a/llvm/test/CodeGen/AArch64/whilewr.ll +++ b/llvm/test/CodeGen/AArch64/whilewr.ll @@ -153,3 +153,56 @@ entry: %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat ret %active.lane.mask.alias } + +define dso_local @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: no_whilewr_128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub x8, x1, x2 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #15 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: csel x9, x9, x8, lt +; CHECK-NEXT: cmn x8, #15 +; CHECK-NEXT: asr x9, x9, #4 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: mov z1.d, x9 +; CHECK-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NEXT: cmphi p0.d, p0/z, z1.d, z0.d +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: no_whilewr_128: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: sub x8, x1, x2 +; CHECK-NOSVE2-NEXT: index z0.d, #0, #1 +; CHECK-NOSVE2-NEXT: ptrue p0.d +; CHECK-NOSVE2-NEXT: add x9, x8, #15 +; CHECK-NOSVE2-NEXT: cmp x8, #0 +; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt +; CHECK-NOSVE2-NEXT: cmn x8, #15 +; CHECK-NOSVE2-NEXT: asr x9, x9, #4 +; CHECK-NOSVE2-NEXT: cset w8, lt +; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NOSVE2-NEXT: mov z1.d, x9 +; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d +; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b +; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: ret +entry: + %b12 = ptrtoint ptr %b to i64 + %c13 = ptrtoint ptr %c to i64 + %sub.diff = sub i64 %b12, %c13 + %diff = sdiv i64 %sub.diff, 16 + %neg.compare = icmp slt i64 %sub.diff, -15 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv1i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + ret %active.lane.mask.alias +} From 0c378a360ce3838cbb5e5a006a21eda7274f6872 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 30 Jul 2024 16:47:23 +0100 Subject: [PATCH 10/19] Remove O3 from test --- llvm/test/CodeGen/AArch64/whilewr.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll index 93ed825cf60f3..bb8c43a20cb71 100644 --- a/llvm/test/CodeGen/AArch64/whilewr.ll +++ b/llvm/test/CodeGen/AArch64/whilewr.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve2 -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2 +; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s +; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2 define dso_local @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_8: ; CHECK: // %bb.0: // %entry From f114dc7413173a12cd65520cf1f50a89058b6052 Mon Sep 17 00:00:00 2001 From: Samuel 
Tebbs Date: Tue, 30 Jul 2024 16:47:56 +0100 Subject: [PATCH 11/19] Remove dso_local --- llvm/test/CodeGen/AArch64/whilewr.ll | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll index bb8c43a20cb71..18a1dcf0078b7 100644 --- a/llvm/test/CodeGen/AArch64/whilewr.ll +++ b/llvm/test/CodeGen/AArch64/whilewr.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s ; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2 -define dso_local @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +define @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: whilewr p0.b, x1, x2 @@ -29,7 +29,7 @@ entry: ret %active.lane.mask.alias } -define dso_local @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +define @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_commutative: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: whilewr p0.b, x1, x2 @@ -57,7 +57,7 @@ entry: ret %active.lane.mask.alias } -define dso_local @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +define @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: whilewr p0.h, x1, x2 @@ -88,7 +88,7 @@ entry: ret %active.lane.mask.alias } -define dso_local @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +define @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: whilewr p0.s, x1, x2 @@ -121,7 +121,7 @@ entry: ret %active.lane.mask.alias } -define dso_local @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +define @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: whilewr p0.d, x1, x2 @@ -154,7 +154,7 @@ entry: ret %active.lane.mask.alias } -define dso_local @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +define @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: no_whilewr_128: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub x8, x1, x2 From bbcb9359ff1dbbde365ca22cd05a5baa3d2ced24 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 30 Jul 2024 16:53:06 +0100 Subject: [PATCH 12/19] Move loop tests to whilewr.ll --- .../Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/test/CodeGen/AArch64/whilewr.ll | 877 +++++++++++++++++ .../LoopVectorize/AArch64/alias_mask.ll | 884 ------------------ 3 files changed, 878 insertions(+), 885 deletions(-) delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index dedbea0c5d0b0..7c35f46fb08e4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13859,7 +13859,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); SDValue ID = DAG.getConstant(IntrinsicID, DL, MVT::i32); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), ID, - StorePtr, ReadPtr); + StorePtr, ReadPtr); } SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll index 
18a1dcf0078b7..67959112705a1 100644 --- a/llvm/test/CodeGen/AArch64/whilewr.ll +++ b/llvm/test/CodeGen/AArch64/whilewr.ll @@ -206,3 +206,880 @@ entry: %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat ret %active.lane.mask.alias } + +define dso_local void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_loop_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB6_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: whilewr p0.b, x1, x2 +; CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NEXT: cntp x10, p0, p0.b +; CHECK-NEXT: and x10, x10, #0xff +; CHECK-NEXT: .LBB6_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8] +; CHECK-NEXT: add z0.b, z1.b, z0.b +; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8] +; CHECK-NEXT: add x8, x8, x10 +; CHECK-NEXT: whilelo p1.b, x8, x9 +; CHECK-NEXT: b.mi .LBB6_2 +; CHECK-NEXT: .LBB6_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_loop_8: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB6_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: sub x9, x1, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9 +; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b +; CHECK-NOSVE2-NEXT: and x10, x10, #0xff +; CHECK-NOSVE2-NEXT: .LBB6_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8] +; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8] +; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b +; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB6_2 +; CHECK-NOSVE2-NEXT: .LBB6_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %c14 = ptrtoint ptr %c to i64 + %b15 = ptrtoint ptr %b to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %sub.diff = sub i64 %b15, %c14 + %neg.compare = icmp slt i64 %sub.diff, 0 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count) + %0 = zext %active.lane.mask.alias to + %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8( %0) + %2 = zext i8 %1 to i64 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %3 = and %active.lane.mask, %active.lane.mask.alias + %4 = getelementptr inbounds i8, ptr %a, i64 %index + %wide.masked.load = 
tail call @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, %3, poison) + %5 = getelementptr inbounds i8, ptr %b, i64 %index + %wide.masked.load16 = tail call @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, %3, poison) + %6 = add %wide.masked.load16, %wide.masked.load + %7 = getelementptr inbounds i8, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv16i8.p0( %6, ptr %7, i32 1, %3) + %index.next = add i64 %index, %2 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count) + %8 = extractelement %active.lane.mask.next, i64 0 + br i1 %8, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_loop_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB7_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: mov w8, w3 +; CHECK-NEXT: whilewr p1.h, x1, x2 +; CHECK-NEXT: mov x9, xzr +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: .LBB7_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1] +; CHECK-NEXT: add z0.h, z1.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x2, x9, lsl #1] +; CHECK-NEXT: inch x9 +; CHECK-NEXT: whilelo p0.h, x9, x8 +; CHECK-NEXT: b.mi .LBB7_2 +; CHECK-NEXT: .LBB7_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_loop_16: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB7_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sub x10, x1, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9 +; CHECK-NOSVE2-NEXT: cmn x10, #1 +; CHECK-NOSVE2-NEXT: add x10, x10, x10, lsr #63 +; CHECK-NOSVE2-NEXT: cset w11, lt +; CHECK-NOSVE2-NEXT: sbfx x11, x11, #0, #1 +; CHECK-NOSVE2-NEXT: asr x10, x10, #1 +; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x11 +; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10 +; CHECK-NOSVE2-NEXT: cnth x10 +; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: .LBB7_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] +; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h +; CHECK-NOSVE2-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p0.h, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB7_2 +; CHECK-NOSVE2-NEXT: .LBB7_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %b14 = ptrtoint ptr %b to i64 + %c15 = ptrtoint ptr %c to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 3 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count) + %sub.diff = sub i64 %b14, %c15 + %diff = sdiv i64 %sub.diff, 2 + %neg.compare = icmp slt i64 %sub.diff, -1 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) 
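+  ; The i16 alias mask divides the pointer difference by the element size
+  ; (the sdiv by 2 above) and ORs in a splat of (%sub.diff < -(2 - 1)):
+  ; the "or (splat (setcc_lt ...)), (get_active_lane_mask ...)" shape that
+  ; tryWhileWRFromOR folds to a single WHILEWR on SVE2.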
+ %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %2 = and %active.lane.mask.alias, %active.lane.mask.entry + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %3 = getelementptr inbounds i16, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, %active.lane.mask, poison) + %4 = getelementptr inbounds i16, ptr %b, i64 %index + %wide.masked.load16 = tail call @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, %active.lane.mask, poison) + %5 = add %wide.masked.load16, %wide.masked.load + %6 = getelementptr inbounds i16, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv8i16.p0( %5, ptr %6, i32 2, %active.lane.mask) + %index.next = add i64 %index, %1 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count) + %7 = extractelement %active.lane.mask.next, i64 0 + br i1 %7, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_loop_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB8_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: mov w8, w3 +; CHECK-NEXT: whilewr p1.s, x1, x2 +; CHECK-NEXT: mov x9, xzr +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: .LBB8_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x2, x9, lsl #2] +; CHECK-NEXT: incw x9 +; CHECK-NEXT: whilelo p0.s, x9, x8 +; CHECK-NEXT: b.mi .LBB8_2 +; CHECK-NEXT: .LBB8_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_loop_32: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB8_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sub x10, x1, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9 +; CHECK-NOSVE2-NEXT: add x11, x10, #3 +; CHECK-NOSVE2-NEXT: cmp x10, #0 +; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt +; CHECK-NOSVE2-NEXT: cmn x10, #3 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: asr x11, x11, #2 +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x11 +; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10 +; CHECK-NOSVE2-NEXT: cntw x10 +; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: .LBB8_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] +; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s +; CHECK-NOSVE2-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p0.s, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB8_2 +; CHECK-NOSVE2-NEXT: .LBB8_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %b12 = ptrtoint ptr %b to i64 + %c13 = ptrtoint ptr %c to i64 + %wide.trip.count = 
zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 2 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count) + %sub.diff = sub i64 %b12, %c13 + %diff = sdiv i64 %sub.diff, 4 + %neg.compare = icmp slt i64 %sub.diff, -3 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %2 = and %active.lane.mask.alias, %active.lane.mask.entry + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %3 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, %active.lane.mask, poison) + %4 = getelementptr inbounds i32, ptr %b, i64 %index + %wide.masked.load14 = tail call @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, %active.lane.mask, poison) + %5 = add %wide.masked.load14, %wide.masked.load + %6 = getelementptr inbounds i32, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv4i32.p0( %5, ptr %6, i32 4, %active.lane.mask) + %index.next = add i64 %index, %1 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count) + %7 = extractelement %active.lane.mask.next, i64 0 + br i1 %7, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_loop_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB9_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: mov w8, w3 +; CHECK-NEXT: whilewr p1.d, x1, x2 +; CHECK-NEXT: mov x9, xzr +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: .LBB9_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3] +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: st1d { z0.d }, p0, [x2, x9, lsl #3] +; CHECK-NEXT: incd x9 +; CHECK-NEXT: whilelo p0.d, x9, x8 +; CHECK-NEXT: b.mi .LBB9_2 +; CHECK-NEXT: .LBB9_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_loop_64: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB9_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sub x10, x1, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9 +; CHECK-NOSVE2-NEXT: add x11, x10, #7 +; CHECK-NOSVE2-NEXT: cmp x10, #0 +; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt +; CHECK-NOSVE2-NEXT: cmn x10, #7 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: asr x11, x11, #3 +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x11 +; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10 +; CHECK-NOSVE2-NEXT: cntd x10 +; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: .LBB9_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] +; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p0/z, 
[x1, x8, lsl #3] +; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d +; CHECK-NOSVE2-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p0.d, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB9_2 +; CHECK-NOSVE2-NEXT: .LBB9_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %b12 = ptrtoint ptr %b to i64 + %c13 = ptrtoint ptr %c to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 1 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count) + %sub.diff = sub i64 %b12, %c13 + %diff = sdiv i64 %sub.diff, 8 + %neg.compare = icmp slt i64 %sub.diff, -7 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %2 = and %active.lane.mask.alias, %active.lane.mask.entry + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %3 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, %active.lane.mask, poison) + %4 = getelementptr inbounds i64, ptr %b, i64 %index + %wide.masked.load14 = tail call @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, %active.lane.mask, poison) + %5 = add %wide.masked.load14, %wide.masked.load + %6 = getelementptr inbounds i64, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv2i64.p0( %5, ptr %6, i32 8, %active.lane.mask) + %index.next = add i64 %index, %1 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count) + %7 = extractelement %active.lane.mask.next, i64 0 + br i1 %7, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_loop_multiple_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB10_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: whilewr p0.b, x0, x2 +; CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: whilewr p1.b, x1, x2 +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NEXT: cntp x10, p0, p0.b +; CHECK-NEXT: and x10, x10, #0xff +; CHECK-NEXT: .LBB10_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8] +; CHECK-NEXT: add z0.b, z1.b, z0.b +; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8] +; CHECK-NEXT: add x8, x8, x10 +; CHECK-NEXT: whilelo p1.b, x8, x9 +; CHECK-NEXT: b.mi .LBB10_2 +; CHECK-NEXT: .LBB10_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_8: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB10_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: sub x9, x0, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: 
whilelo p0.b, xzr, x9 +; CHECK-NOSVE2-NEXT: sub x9, x1, x2 +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x10 +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: whilelo p3.b, xzr, x9 +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: whilelo p2.b, xzr, x10 +; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b +; CHECK-NOSVE2-NEXT: and x10, x10, #0xff +; CHECK-NOSVE2-NEXT: .LBB10_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8] +; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8] +; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b +; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB10_2 +; CHECK-NOSVE2-NEXT: .LBB10_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %c14 = ptrtoint ptr %c to i64 + %a15 = ptrtoint ptr %a to i64 + %b16 = ptrtoint ptr %b to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %sub.diff = sub i64 %a15, %c14 + %neg.compare = icmp slt i64 %sub.diff, 0 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %sub.diff18 = sub i64 %b16, %c14 + %neg.compare20 = icmp slt i64 %sub.diff18, 0 + %.splatinsert21 = insertelement poison, i1 %neg.compare20, i64 0 + %.splat22 = shufflevector %.splatinsert21, poison, zeroinitializer + %ptr.diff.lane.mask23 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff18) + %active.lane.mask.alias24 = or %ptr.diff.lane.mask23, %.splat22 + %0 = and %active.lane.mask.alias, %active.lane.mask.alias24 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count) + %1 = zext %0 to + %2 = tail call i8 @llvm.vector.reduce.add.nxv16i8( %1) + %3 = zext i8 %2 to i64 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %4 = and %active.lane.mask, %0 + %5 = getelementptr inbounds i8, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, %4, poison) + %6 = getelementptr inbounds i8, ptr %b, i64 %index + %wide.masked.load25 = tail call @llvm.masked.load.nxv16i8.p0(ptr %6, i32 1, %4, poison) + %7 = add %wide.masked.load25, %wide.masked.load + %8 = getelementptr inbounds i8, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv16i8.p0( %7, ptr %8, i32 1, %4) + %index.next = add i64 %index, %3 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count) + %9 = extractelement %active.lane.mask.next, i64 0 + br i1 %9, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_loop_multiple_16(ptr 
%a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_loop_multiple_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB11_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: whilewr p0.h, x0, x2 +; CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: whilewr p1.h, x1, x2 +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NEXT: whilelo p1.h, xzr, x9 +; CHECK-NEXT: cntp x10, p0, p0.h +; CHECK-NEXT: and x10, x10, #0xff +; CHECK-NEXT: .LBB11_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1] +; CHECK-NEXT: add z0.h, z1.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1] +; CHECK-NEXT: add x8, x8, x10 +; CHECK-NEXT: whilelo p1.h, x8, x9 +; CHECK-NEXT: b.mi .LBB11_2 +; CHECK-NEXT: .LBB11_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_16: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB11_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: sub x9, x0, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: cmn x9, #1 +; CHECK-NOSVE2-NEXT: add x9, x9, x9, lsr #63 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: asr x9, x9, #1 +; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x10 +; CHECK-NOSVE2-NEXT: sub x10, x1, x2 +; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9 +; CHECK-NOSVE2-NEXT: add x9, x10, x10, lsr #63 +; CHECK-NOSVE2-NEXT: cmn x10, #1 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: asr x9, x9, #1 +; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p3.h, xzr, x9 +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10 +; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9 +; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.h +; CHECK-NOSVE2-NEXT: and x10, x10, #0xff +; CHECK-NOSVE2-NEXT: .LBB11_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1] +; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h +; CHECK-NOSVE2-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p1.h, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB11_2 +; CHECK-NOSVE2-NEXT: .LBB11_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %c14 = ptrtoint ptr %c to i64 + %a15 = ptrtoint ptr %a to i64 + %b16 = ptrtoint ptr %b to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %sub.diff = sub i64 %a15, %c14 + %diff = sdiv i64 %sub.diff, 2 + %neg.compare = icmp slt i64 %sub.diff, -1 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %sub.diff18 = sub i64 %b16, %c14 + %diff19 = sdiv i64 %sub.diff18, 2 + %neg.compare20 = icmp slt i64 %sub.diff18, -1 + %.splatinsert21 = insertelement poison, i1 
%neg.compare20, i64 0 + %.splat22 = shufflevector %.splatinsert21, poison, zeroinitializer + %ptr.diff.lane.mask23 = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff19) + %active.lane.mask.alias24 = or %ptr.diff.lane.mask23, %.splat22 + %0 = and %active.lane.mask.alias, %active.lane.mask.alias24 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count) + %1 = zext %0 to + %2 = tail call i8 @llvm.vector.reduce.add.nxv8i8( %1) + %3 = zext i8 %2 to i64 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %4 = and %active.lane.mask, %0 + %5 = getelementptr inbounds i16, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv8i16.p0(ptr %5, i32 2, %4, poison) + %6 = getelementptr inbounds i16, ptr %b, i64 %index + %wide.masked.load25 = tail call @llvm.masked.load.nxv8i16.p0(ptr %6, i32 2, %4, poison) + %7 = add %wide.masked.load25, %wide.masked.load + %8 = getelementptr inbounds i16, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv8i16.p0( %7, ptr %8, i32 2, %4) + %index.next = add i64 %index, %3 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count) + %9 = extractelement %active.lane.mask.next, i64 0 + br i1 %9, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_loop_multiple_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB12_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: whilewr p0.s, x0, x2 +; CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: whilewr p1.s, x1, x2 +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NEXT: whilelo p1.s, xzr, x9 +; CHECK-NEXT: cntp x10, p0, p0.s +; CHECK-NEXT: and x10, x10, #0xff +; CHECK-NEXT: .LBB12_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2] +; CHECK-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2] +; CHECK-NEXT: add x8, x8, x10 +; CHECK-NEXT: whilelo p1.s, x8, x9 +; CHECK-NEXT: b.mi .LBB12_2 +; CHECK-NEXT: .LBB12_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_32: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB12_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: sub x9, x0, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: add x10, x9, #3 +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt +; CHECK-NOSVE2-NEXT: cmn x9, #3 +; CHECK-NOSVE2-NEXT: asr x9, x10, #2 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9 +; CHECK-NOSVE2-NEXT: sub x9, x1, x2 +; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10 +; CHECK-NOSVE2-NEXT: add x10, x9, #3 +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt +; CHECK-NOSVE2-NEXT: cmn x9, #3 +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: asr x10, x10, #2 +; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 +; 
CHECK-NOSVE2-NEXT: whilelo p3.s, xzr, x10 +; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x9 +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9 +; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.s +; CHECK-NOSVE2-NEXT: and x10, x10, #0xff +; CHECK-NOSVE2-NEXT: .LBB12_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] +; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2] +; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s +; CHECK-NOSVE2-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p1.s, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB12_2 +; CHECK-NOSVE2-NEXT: .LBB12_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %c12 = ptrtoint ptr %c to i64 + %a13 = ptrtoint ptr %a to i64 + %b14 = ptrtoint ptr %b to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %sub.diff = sub i64 %a13, %c12 + %diff = sdiv i64 %sub.diff, 4 + %neg.compare = icmp slt i64 %sub.diff, -3 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %sub.diff16 = sub i64 %b14, %c12 + %diff17 = sdiv i64 %sub.diff16, 4 + %neg.compare18 = icmp slt i64 %sub.diff16, -3 + %.splatinsert19 = insertelement poison, i1 %neg.compare18, i64 0 + %.splat20 = shufflevector %.splatinsert19, poison, zeroinitializer + %ptr.diff.lane.mask21 = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff17) + %active.lane.mask.alias22 = or %ptr.diff.lane.mask21, %.splat20 + %0 = and %active.lane.mask.alias, %active.lane.mask.alias22 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count) + %1 = zext %0 to + %2 = tail call i8 @llvm.vector.reduce.add.nxv4i8( %1) + %3 = zext i8 %2 to i64 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %4 = and %active.lane.mask, %0 + %5 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv4i32.p0(ptr %5, i32 4, %4, poison) + %6 = getelementptr inbounds i32, ptr %b, i64 %index + %wide.masked.load23 = tail call @llvm.masked.load.nxv4i32.p0(ptr %6, i32 4, %4, poison) + %7 = add %wide.masked.load23, %wide.masked.load + %8 = getelementptr inbounds i32, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv4i32.p0( %7, ptr %8, i32 4, %4) + %index.next = add i64 %index, %3 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count) + %9 = extractelement %active.lane.mask.next, i64 0 + br i1 %9, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define dso_local void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilewr_loop_multiple_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w3, #1 +; CHECK-NEXT: b.lt .LBB13_3 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: whilewr 
p0.d, x0, x2 +; CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: whilewr p1.d, x1, x2 +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NEXT: whilelo p1.d, xzr, x9 +; CHECK-NEXT: cntp x10, p0, p0.d +; CHECK-NEXT: and x10, x10, #0xff +; CHECK-NEXT: .LBB13_2: // %vector.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3] +; CHECK-NEXT: add x8, x8, x10 +; CHECK-NEXT: whilelo p1.d, x8, x9 +; CHECK-NEXT: b.mi .LBB13_2 +; CHECK-NEXT: .LBB13_3: // %for.cond.cleanup +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_64: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: cmp w3, #1 +; CHECK-NOSVE2-NEXT: b.lt .LBB13_3 +; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NOSVE2-NEXT: sub x9, x0, x2 +; CHECK-NOSVE2-NEXT: mov x8, xzr +; CHECK-NOSVE2-NEXT: add x10, x9, #7 +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt +; CHECK-NOSVE2-NEXT: cmn x9, #7 +; CHECK-NOSVE2-NEXT: asr x9, x10, #3 +; CHECK-NOSVE2-NEXT: cset w10, lt +; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9 +; CHECK-NOSVE2-NEXT: sub x9, x1, x2 +; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10 +; CHECK-NOSVE2-NEXT: add x10, x9, #7 +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt +; CHECK-NOSVE2-NEXT: cmn x9, #7 +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: asr x10, x10, #3 +; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p3.d, xzr, x10 +; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x9 +; CHECK-NOSVE2-NEXT: mov w9, w3 +; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b +; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9 +; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.d +; CHECK-NOSVE2-NEXT: and x10, x10, #0xff +; CHECK-NOSVE2-NEXT: .LBB13_2: // %vector.body +; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] +; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d +; CHECK-NOSVE2-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3] +; CHECK-NOSVE2-NEXT: add x8, x8, x10 +; CHECK-NOSVE2-NEXT: whilelo p1.d, x8, x9 +; CHECK-NOSVE2-NEXT: b.mi .LBB13_2 +; CHECK-NOSVE2-NEXT: .LBB13_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: ret +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %c12 = ptrtoint ptr %c to i64 + %a13 = ptrtoint ptr %a to i64 + %b14 = ptrtoint ptr %b to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %sub.diff = sub i64 %a13, %c12 + %diff = sdiv i64 %sub.diff, 8 + %neg.compare = icmp slt i64 %sub.diff, -7 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + %sub.diff16 = sub i64 %b14, %c12 + %diff17 = sdiv i64 %sub.diff16, 8 + %neg.compare18 = icmp slt i64 %sub.diff16, -7 + %.splatinsert19 = insertelement poison, i1 %neg.compare18, i64 0 + %.splat20 = shufflevector %.splatinsert19, poison, zeroinitializer + 
%ptr.diff.lane.mask21 = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff17) + %active.lane.mask.alias22 = or %ptr.diff.lane.mask21, %.splat20 + %0 = and %active.lane.mask.alias, %active.lane.mask.alias22 + %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count) + %1 = zext %0 to + %2 = tail call i8 @llvm.vector.reduce.add.nxv2i8( %1) + %3 = zext i8 %2 to i64 + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %4 = and %active.lane.mask, %0 + %5 = getelementptr inbounds i64, ptr %a, i64 %index + %wide.masked.load = tail call @llvm.masked.load.nxv2i64.p0(ptr %5, i32 8, %4, poison) + %6 = getelementptr inbounds i64, ptr %b, i64 %index + %wide.masked.load23 = tail call @llvm.masked.load.nxv2i64.p0(ptr %6, i32 8, %4, poison) + %7 = add %wide.masked.load23, %wide.masked.load + %8 = getelementptr inbounds i64, ptr %c, i64 %index + tail call void @llvm.masked.store.nxv2i64.p0( %7, ptr %8, i32 8, %4) + %index.next = add i64 %index, %3 + %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count) + %9 = extractelement %active.lane.mask.next, i64 0 + br i1 %9, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +declare i64 @llvm.vscale.i64() + +declare @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64) + +declare @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, , ) + +declare void @llvm.masked.store.nxv16i8.p0(, ptr nocapture, i32 immarg, ) + +declare @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64) + +declare @llvm.masked.load.nxv8i16.p0(ptr nocapture, i32 immarg, , ) + +declare void @llvm.masked.store.nxv8i16.p0(, ptr nocapture, i32 immarg, ) + +declare @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64) + +declare @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, , ) + +declare void @llvm.masked.store.nxv4i32.p0(, ptr nocapture, i32 immarg, ) + +declare @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64) + +declare @llvm.masked.load.nxv2i64.p0(ptr nocapture, i32 immarg, , ) + +declare void @llvm.masked.store.nxv2i64.p0(, ptr nocapture, i32 immarg, ) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll deleted file mode 100644 index 3662efa41c151..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll +++ /dev/null @@ -1,884 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve2 -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-linux-gnu -O3 -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2 -define dso_local void @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB0_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: whilewr p0.b, x1, x2 -; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: whilelo p1.b, xzr, x9 -; CHECK-NEXT: cntp x10, p0, p0.b -; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB0_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8] -; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8] -; CHECK-NEXT: add 
z0.b, z1.b, z0.b -; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8] -; CHECK-NEXT: add x8, x8, x10 -; CHECK-NEXT: whilelo p1.b, x8, x9 -; CHECK-NEXT: b.mi .LBB0_2 -; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: whilewr_8: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB0_3 -; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NOSVE2-NEXT: sub x9, x1, x2 -; CHECK-NOSVE2-NEXT: mov x8, xzr -; CHECK-NOSVE2-NEXT: cmp x9, #0 -; CHECK-NOSVE2-NEXT: cset w10, lt -; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9 -; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9 -; CHECK-NOSVE2-NEXT: mov w9, w3 -; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b -; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9 -; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b -; CHECK-NOSVE2-NEXT: and x10, x10, #0xff -; CHECK-NOSVE2-NEXT: .LBB0_2: // %vector.body -; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8] -; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8] -; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b -; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8] -; CHECK-NOSVE2-NEXT: add x8, x8, x10 -; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB0_2 -; CHECK-NOSVE2-NEXT: .LBB0_3: // %for.cond.cleanup -; CHECK-NOSVE2-NEXT: ret -entry: - %cmp11 = icmp sgt i32 %n, 0 - br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup - -for.body.preheader: - %c14 = ptrtoint ptr %c to i64 - %b15 = ptrtoint ptr %b to i64 - %wide.trip.count = zext nneg i32 %n to i64 - %sub.diff = sub i64 %b15, %c14 - %neg.compare = icmp slt i64 %sub.diff, 0 - %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 - %.splat = shufflevector %.splatinsert, poison, zeroinitializer - %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff) - %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat - %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count) - %0 = zext %active.lane.mask.alias to - %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8( %0) - %2 = zext i8 %1 to i64 - br label %vector.body - -vector.body: - %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] - %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] - %3 = and %active.lane.mask, %active.lane.mask.alias - %4 = getelementptr inbounds i8, ptr %a, i64 %index - %wide.masked.load = tail call @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, %3, poison) - %5 = getelementptr inbounds i8, ptr %b, i64 %index - %wide.masked.load16 = tail call @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, %3, poison) - %6 = add %wide.masked.load16, %wide.masked.load - %7 = getelementptr inbounds i8, ptr %c, i64 %index - tail call void @llvm.masked.store.nxv16i8.p0( %6, ptr %7, i32 1, %3) - %index.next = add i64 %index, %2 - %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count) - %8 = extractelement %active.lane.mask.next, i64 0 - br i1 %8, label %vector.body, label %for.cond.cleanup - -for.cond.cleanup: - ret void -} - -define dso_local void @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB1_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: 
mov w8, w3 -; CHECK-NEXT: whilewr p1.h, x1, x2 -; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: whilelo p0.h, xzr, x8 -; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b -; CHECK-NEXT: .LBB1_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1] -; CHECK-NEXT: add z0.h, z1.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x2, x9, lsl #1] -; CHECK-NEXT: inch x9 -; CHECK-NEXT: whilelo p0.h, x9, x8 -; CHECK-NEXT: b.mi .LBB1_2 -; CHECK-NEXT: .LBB1_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: whilewr_16: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB1_3 -; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NOSVE2-NEXT: mov w9, w3 -; CHECK-NOSVE2-NEXT: sub x10, x1, x2 -; CHECK-NOSVE2-NEXT: mov x8, xzr -; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9 -; CHECK-NOSVE2-NEXT: cmn x10, #1 -; CHECK-NOSVE2-NEXT: add x10, x10, x10, lsr #63 -; CHECK-NOSVE2-NEXT: cset w11, lt -; CHECK-NOSVE2-NEXT: sbfx x11, x11, #0, #1 -; CHECK-NOSVE2-NEXT: asr x10, x10, #1 -; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x11 -; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10 -; CHECK-NOSVE2-NEXT: cnth x10 -; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b -; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b -; CHECK-NOSVE2-NEXT: .LBB1_2: // %vector.body -; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] -; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h -; CHECK-NOSVE2-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] -; CHECK-NOSVE2-NEXT: add x8, x8, x10 -; CHECK-NOSVE2-NEXT: whilelo p0.h, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB1_2 -; CHECK-NOSVE2-NEXT: .LBB1_3: // %for.cond.cleanup -; CHECK-NOSVE2-NEXT: ret -entry: - %cmp11 = icmp sgt i32 %n, 0 - br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup - -for.body.preheader: - %b14 = ptrtoint ptr %b to i64 - %c15 = ptrtoint ptr %c to i64 - %wide.trip.count = zext nneg i32 %n to i64 - %0 = tail call i64 @llvm.vscale.i64() - %1 = shl nuw nsw i64 %0, 3 - %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count) - %sub.diff = sub i64 %b14, %c15 - %diff = sdiv i64 %sub.diff, 2 - %neg.compare = icmp slt i64 %sub.diff, -1 - %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 - %.splat = shufflevector %.splatinsert, poison, zeroinitializer - %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) - %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat - %2 = and %active.lane.mask.alias, %active.lane.mask.entry - br label %vector.body - -vector.body: - %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] - %active.lane.mask = phi [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] - %3 = getelementptr inbounds i16, ptr %a, i64 %index - %wide.masked.load = tail call @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, %active.lane.mask, poison) - %4 = getelementptr inbounds i16, ptr %b, i64 %index - %wide.masked.load16 = tail call @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, %active.lane.mask, poison) - %5 = add %wide.masked.load16, %wide.masked.load - %6 = getelementptr inbounds i16, ptr %c, i64 %index - tail call void @llvm.masked.store.nxv8i16.p0( %5, ptr %6, i32 2, %active.lane.mask) - %index.next = add i64 %index, %1 - %active.lane.mask.next = tail call 
@llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count) - %7 = extractelement %active.lane.mask.next, i64 0 - br i1 %7, label %vector.body, label %for.cond.cleanup - -for.cond.cleanup: - ret void -} - -define dso_local void @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB2_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: mov w8, w3 -; CHECK-NEXT: whilewr p1.s, x1, x2 -; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: whilelo p0.s, xzr, x8 -; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b -; CHECK-NEXT: .LBB2_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2] -; CHECK-NEXT: add z0.s, z1.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x2, x9, lsl #2] -; CHECK-NEXT: incw x9 -; CHECK-NEXT: whilelo p0.s, x9, x8 -; CHECK-NEXT: b.mi .LBB2_2 -; CHECK-NEXT: .LBB2_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: whilewr_32: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB2_3 -; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NOSVE2-NEXT: mov w9, w3 -; CHECK-NOSVE2-NEXT: sub x10, x1, x2 -; CHECK-NOSVE2-NEXT: mov x8, xzr -; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9 -; CHECK-NOSVE2-NEXT: add x11, x10, #3 -; CHECK-NOSVE2-NEXT: cmp x10, #0 -; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt -; CHECK-NOSVE2-NEXT: cmn x10, #3 -; CHECK-NOSVE2-NEXT: cset w10, lt -; CHECK-NOSVE2-NEXT: asr x11, x11, #2 -; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x11 -; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10 -; CHECK-NOSVE2-NEXT: cntw x10 -; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b -; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b -; CHECK-NOSVE2-NEXT: .LBB2_2: // %vector.body -; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] -; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s -; CHECK-NOSVE2-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] -; CHECK-NOSVE2-NEXT: add x8, x8, x10 -; CHECK-NOSVE2-NEXT: whilelo p0.s, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB2_2 -; CHECK-NOSVE2-NEXT: .LBB2_3: // %for.cond.cleanup -; CHECK-NOSVE2-NEXT: ret -entry: - %cmp9 = icmp sgt i32 %n, 0 - br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup - -for.body.preheader: - %b12 = ptrtoint ptr %b to i64 - %c13 = ptrtoint ptr %c to i64 - %wide.trip.count = zext nneg i32 %n to i64 - %0 = tail call i64 @llvm.vscale.i64() - %1 = shl nuw nsw i64 %0, 2 - %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count) - %sub.diff = sub i64 %b12, %c13 - %diff = sdiv i64 %sub.diff, 4 - %neg.compare = icmp slt i64 %sub.diff, -3 - %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 - %.splat = shufflevector %.splatinsert, poison, zeroinitializer - %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) - %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat - %2 = and %active.lane.mask.alias, %active.lane.mask.entry - br label %vector.body - -vector.body: - %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] - %active.lane.mask = phi [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] - %3 = getelementptr inbounds i32, ptr %a, i64 %index - 
%wide.masked.load = tail call @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, %active.lane.mask, poison) - %4 = getelementptr inbounds i32, ptr %b, i64 %index - %wide.masked.load14 = tail call @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, %active.lane.mask, poison) - %5 = add %wide.masked.load14, %wide.masked.load - %6 = getelementptr inbounds i32, ptr %c, i64 %index - tail call void @llvm.masked.store.nxv4i32.p0( %5, ptr %6, i32 4, %active.lane.mask) - %index.next = add i64 %index, %1 - %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count) - %7 = extractelement %active.lane.mask.next, i64 0 - br i1 %7, label %vector.body, label %for.cond.cleanup - -for.cond.cleanup: - ret void -} - -define dso_local void @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB3_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: mov w8, w3 -; CHECK-NEXT: whilewr p1.d, x1, x2 -; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: whilelo p0.d, xzr, x8 -; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b -; CHECK-NEXT: .LBB3_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3] -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: st1d { z0.d }, p0, [x2, x9, lsl #3] -; CHECK-NEXT: incd x9 -; CHECK-NEXT: whilelo p0.d, x9, x8 -; CHECK-NEXT: b.mi .LBB3_2 -; CHECK-NEXT: .LBB3_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: whilewr_64: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB3_3 -; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NOSVE2-NEXT: mov w9, w3 -; CHECK-NOSVE2-NEXT: sub x10, x1, x2 -; CHECK-NOSVE2-NEXT: mov x8, xzr -; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9 -; CHECK-NOSVE2-NEXT: add x11, x10, #7 -; CHECK-NOSVE2-NEXT: cmp x10, #0 -; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt -; CHECK-NOSVE2-NEXT: cmn x10, #7 -; CHECK-NOSVE2-NEXT: cset w10, lt -; CHECK-NOSVE2-NEXT: asr x11, x11, #3 -; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x11 -; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10 -; CHECK-NOSVE2-NEXT: cntd x10 -; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b -; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b -; CHECK-NOSVE2-NEXT: .LBB3_2: // %vector.body -; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] -; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d -; CHECK-NOSVE2-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] -; CHECK-NOSVE2-NEXT: add x8, x8, x10 -; CHECK-NOSVE2-NEXT: whilelo p0.d, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB3_2 -; CHECK-NOSVE2-NEXT: .LBB3_3: // %for.cond.cleanup -; CHECK-NOSVE2-NEXT: ret -entry: - %cmp9 = icmp sgt i32 %n, 0 - br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup - -for.body.preheader: - %b12 = ptrtoint ptr %b to i64 - %c13 = ptrtoint ptr %c to i64 - %wide.trip.count = zext nneg i32 %n to i64 - %0 = tail call i64 @llvm.vscale.i64() - %1 = shl nuw nsw i64 %0, 1 - %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count) - %sub.diff = sub i64 %b12, %c13 - %diff = sdiv i64 %sub.diff, 8 - %neg.compare = icmp slt i64 %sub.diff, -7 - %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 - %.splat = shufflevector 
%.splatinsert, poison, zeroinitializer - %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) - %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat - %2 = and %active.lane.mask.alias, %active.lane.mask.entry - br label %vector.body - -vector.body: - %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] - %active.lane.mask = phi [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] - %3 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.masked.load = tail call @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, %active.lane.mask, poison) - %4 = getelementptr inbounds i64, ptr %b, i64 %index - %wide.masked.load14 = tail call @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, %active.lane.mask, poison) - %5 = add %wide.masked.load14, %wide.masked.load - %6 = getelementptr inbounds i64, ptr %c, i64 %index - tail call void @llvm.masked.store.nxv2i64.p0( %5, ptr %6, i32 8, %active.lane.mask) - %index.next = add i64 %index, %1 - %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count) - %7 = extractelement %active.lane.mask.next, i64 0 - br i1 %7, label %vector.body, label %for.cond.cleanup - -for.cond.cleanup: - ret void -} - -define dso_local void @whilewr_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_multiple_8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB4_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: whilewr p0.b, x0, x2 -; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: whilewr p1.b, x1, x2 -; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b -; CHECK-NEXT: whilelo p1.b, xzr, x9 -; CHECK-NEXT: cntp x10, p0, p0.b -; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB4_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8] -; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8] -; CHECK-NEXT: add z0.b, z1.b, z0.b -; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8] -; CHECK-NEXT: add x8, x8, x10 -; CHECK-NEXT: whilelo p1.b, x8, x9 -; CHECK-NEXT: b.mi .LBB4_2 -; CHECK-NEXT: .LBB4_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: whilewr_multiple_8: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB4_3 -; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NOSVE2-NEXT: sub x9, x0, x2 -; CHECK-NOSVE2-NEXT: mov x8, xzr -; CHECK-NOSVE2-NEXT: cmp x9, #0 -; CHECK-NOSVE2-NEXT: cset w10, lt -; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9 -; CHECK-NOSVE2-NEXT: sub x9, x1, x2 -; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x10 -; CHECK-NOSVE2-NEXT: cmp x9, #0 -; CHECK-NOSVE2-NEXT: cset w10, lt -; CHECK-NOSVE2-NEXT: whilelo p3.b, xzr, x9 -; CHECK-NOSVE2-NEXT: mov w9, w3 -; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b -; CHECK-NOSVE2-NEXT: whilelo p2.b, xzr, x10 -; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b -; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b -; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9 -; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b -; CHECK-NOSVE2-NEXT: and x10, x10, #0xff -; CHECK-NOSVE2-NEXT: .LBB4_2: // %vector.body -; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8] -; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8] -; CHECK-NOSVE2-NEXT: add z0.b, 
z1.b, z0.b -; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8] -; CHECK-NOSVE2-NEXT: add x8, x8, x10 -; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB4_2 -; CHECK-NOSVE2-NEXT: .LBB4_3: // %for.cond.cleanup -; CHECK-NOSVE2-NEXT: ret -entry: - %cmp11 = icmp sgt i32 %n, 0 - br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup - -for.body.preheader: - %c14 = ptrtoint ptr %c to i64 - %a15 = ptrtoint ptr %a to i64 - %b16 = ptrtoint ptr %b to i64 - %wide.trip.count = zext nneg i32 %n to i64 - %sub.diff = sub i64 %a15, %c14 - %neg.compare = icmp slt i64 %sub.diff, 0 - %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 - %.splat = shufflevector %.splatinsert, poison, zeroinitializer - %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff) - %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat - %sub.diff18 = sub i64 %b16, %c14 - %neg.compare20 = icmp slt i64 %sub.diff18, 0 - %.splatinsert21 = insertelement poison, i1 %neg.compare20, i64 0 - %.splat22 = shufflevector %.splatinsert21, poison, zeroinitializer - %ptr.diff.lane.mask23 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff18) - %active.lane.mask.alias24 = or %ptr.diff.lane.mask23, %.splat22 - %0 = and %active.lane.mask.alias, %active.lane.mask.alias24 - %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count) - %1 = zext %0 to - %2 = tail call i8 @llvm.vector.reduce.add.nxv16i8( %1) - %3 = zext i8 %2 to i64 - br label %vector.body - -vector.body: - %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] - %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] - %4 = and %active.lane.mask, %0 - %5 = getelementptr inbounds i8, ptr %a, i64 %index - %wide.masked.load = tail call @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, %4, poison) - %6 = getelementptr inbounds i8, ptr %b, i64 %index - %wide.masked.load25 = tail call @llvm.masked.load.nxv16i8.p0(ptr %6, i32 1, %4, poison) - %7 = add %wide.masked.load25, %wide.masked.load - %8 = getelementptr inbounds i8, ptr %c, i64 %index - tail call void @llvm.masked.store.nxv16i8.p0( %7, ptr %8, i32 1, %4) - %index.next = add i64 %index, %3 - %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count) - %9 = extractelement %active.lane.mask.next, i64 0 - br i1 %9, label %vector.body, label %for.cond.cleanup - -for.cond.cleanup: - ret void -} - -define dso_local void @whilewr_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_multiple_16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB5_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: whilewr p0.h, x0, x2 -; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: whilewr p1.h, x1, x2 -; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b -; CHECK-NEXT: whilelo p1.h, xzr, x9 -; CHECK-NEXT: cntp x10, p0, p0.h -; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB5_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] -; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1] -; CHECK-NEXT: add z0.h, z1.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1] -; CHECK-NEXT: add x8, x8, x10 -; CHECK-NEXT: whilelo p1.h, x8, x9 -; CHECK-NEXT: b.mi .LBB5_2 -; CHECK-NEXT: .LBB5_3: // %for.cond.cleanup -; 
CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: whilewr_multiple_16: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB5_3 -; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NOSVE2-NEXT: sub x9, x0, x2 -; CHECK-NOSVE2-NEXT: mov x8, xzr -; CHECK-NOSVE2-NEXT: cmn x9, #1 -; CHECK-NOSVE2-NEXT: add x9, x9, x9, lsr #63 -; CHECK-NOSVE2-NEXT: cset w10, lt -; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NOSVE2-NEXT: asr x9, x9, #1 -; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x10 -; CHECK-NOSVE2-NEXT: sub x10, x1, x2 -; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9 -; CHECK-NOSVE2-NEXT: add x9, x10, x10, lsr #63 -; CHECK-NOSVE2-NEXT: cmn x10, #1 -; CHECK-NOSVE2-NEXT: cset w10, lt -; CHECK-NOSVE2-NEXT: asr x9, x9, #1 -; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b -; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p3.h, xzr, x9 -; CHECK-NOSVE2-NEXT: mov w9, w3 -; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10 -; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b -; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b -; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9 -; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.h -; CHECK-NOSVE2-NEXT: and x10, x10, #0xff -; CHECK-NOSVE2-NEXT: .LBB5_2: // %vector.body -; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] -; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1] -; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h -; CHECK-NOSVE2-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1] -; CHECK-NOSVE2-NEXT: add x8, x8, x10 -; CHECK-NOSVE2-NEXT: whilelo p1.h, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB5_2 -; CHECK-NOSVE2-NEXT: .LBB5_3: // %for.cond.cleanup -; CHECK-NOSVE2-NEXT: ret -entry: - %cmp11 = icmp sgt i32 %n, 0 - br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup - -for.body.preheader: - %c14 = ptrtoint ptr %c to i64 - %a15 = ptrtoint ptr %a to i64 - %b16 = ptrtoint ptr %b to i64 - %wide.trip.count = zext nneg i32 %n to i64 - %sub.diff = sub i64 %a15, %c14 - %diff = sdiv i64 %sub.diff, 2 - %neg.compare = icmp slt i64 %sub.diff, -1 - %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 - %.splat = shufflevector %.splatinsert, poison, zeroinitializer - %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) - %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat - %sub.diff18 = sub i64 %b16, %c14 - %diff19 = sdiv i64 %sub.diff18, 2 - %neg.compare20 = icmp slt i64 %sub.diff18, -1 - %.splatinsert21 = insertelement poison, i1 %neg.compare20, i64 0 - %.splat22 = shufflevector %.splatinsert21, poison, zeroinitializer - %ptr.diff.lane.mask23 = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff19) - %active.lane.mask.alias24 = or %ptr.diff.lane.mask23, %.splat22 - %0 = and %active.lane.mask.alias, %active.lane.mask.alias24 - %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count) - %1 = zext %0 to - %2 = tail call i8 @llvm.vector.reduce.add.nxv8i8( %1) - %3 = zext i8 %2 to i64 - br label %vector.body - -vector.body: - %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] - %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] - %4 = and %active.lane.mask, %0 - %5 = getelementptr inbounds i16, ptr %a, i64 %index - %wide.masked.load = tail call @llvm.masked.load.nxv8i16.p0(ptr %5, i32 2, %4, poison) - %6 = getelementptr inbounds i16, 
ptr %b, i64 %index - %wide.masked.load25 = tail call @llvm.masked.load.nxv8i16.p0(ptr %6, i32 2, %4, poison) - %7 = add %wide.masked.load25, %wide.masked.load - %8 = getelementptr inbounds i16, ptr %c, i64 %index - tail call void @llvm.masked.store.nxv8i16.p0( %7, ptr %8, i32 2, %4) - %index.next = add i64 %index, %3 - %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count) - %9 = extractelement %active.lane.mask.next, i64 0 - br i1 %9, label %vector.body, label %for.cond.cleanup - -for.cond.cleanup: - ret void -} - -define dso_local void @whilewr_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_multiple_32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB6_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: whilewr p0.s, x0, x2 -; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: whilewr p1.s, x1, x2 -; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b -; CHECK-NEXT: whilelo p1.s, xzr, x9 -; CHECK-NEXT: cntp x10, p0, p0.s -; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB6_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2] -; CHECK-NEXT: add z0.s, z1.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2] -; CHECK-NEXT: add x8, x8, x10 -; CHECK-NEXT: whilelo p1.s, x8, x9 -; CHECK-NEXT: b.mi .LBB6_2 -; CHECK-NEXT: .LBB6_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: whilewr_multiple_32: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB6_3 -; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NOSVE2-NEXT: sub x9, x0, x2 -; CHECK-NOSVE2-NEXT: mov x8, xzr -; CHECK-NOSVE2-NEXT: add x10, x9, #3 -; CHECK-NOSVE2-NEXT: cmp x9, #0 -; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt -; CHECK-NOSVE2-NEXT: cmn x9, #3 -; CHECK-NOSVE2-NEXT: asr x9, x10, #2 -; CHECK-NOSVE2-NEXT: cset w10, lt -; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9 -; CHECK-NOSVE2-NEXT: sub x9, x1, x2 -; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10 -; CHECK-NOSVE2-NEXT: add x10, x9, #3 -; CHECK-NOSVE2-NEXT: cmp x9, #0 -; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt -; CHECK-NOSVE2-NEXT: cmn x9, #3 -; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b -; CHECK-NOSVE2-NEXT: cset w9, lt -; CHECK-NOSVE2-NEXT: asr x10, x10, #2 -; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p3.s, xzr, x10 -; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x9 -; CHECK-NOSVE2-NEXT: mov w9, w3 -; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b -; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b -; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9 -; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.s -; CHECK-NOSVE2-NEXT: and x10, x10, #0xff -; CHECK-NOSVE2-NEXT: .LBB6_2: // %vector.body -; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] -; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2] -; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s -; CHECK-NOSVE2-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2] -; CHECK-NOSVE2-NEXT: add x8, x8, x10 -; CHECK-NOSVE2-NEXT: whilelo p1.s, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB6_2 -; CHECK-NOSVE2-NEXT: .LBB6_3: // %for.cond.cleanup -; CHECK-NOSVE2-NEXT: ret -entry: - %cmp9 = icmp sgt i32 %n, 0 - br i1 %cmp9, label 
%for.body.preheader, label %for.cond.cleanup - -for.body.preheader: - %c12 = ptrtoint ptr %c to i64 - %a13 = ptrtoint ptr %a to i64 - %b14 = ptrtoint ptr %b to i64 - %wide.trip.count = zext nneg i32 %n to i64 - %sub.diff = sub i64 %a13, %c12 - %diff = sdiv i64 %sub.diff, 4 - %neg.compare = icmp slt i64 %sub.diff, -3 - %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 - %.splat = shufflevector %.splatinsert, poison, zeroinitializer - %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) - %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat - %sub.diff16 = sub i64 %b14, %c12 - %diff17 = sdiv i64 %sub.diff16, 4 - %neg.compare18 = icmp slt i64 %sub.diff16, -3 - %.splatinsert19 = insertelement poison, i1 %neg.compare18, i64 0 - %.splat20 = shufflevector %.splatinsert19, poison, zeroinitializer - %ptr.diff.lane.mask21 = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff17) - %active.lane.mask.alias22 = or %ptr.diff.lane.mask21, %.splat20 - %0 = and %active.lane.mask.alias, %active.lane.mask.alias22 - %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count) - %1 = zext %0 to - %2 = tail call i8 @llvm.vector.reduce.add.nxv4i8( %1) - %3 = zext i8 %2 to i64 - br label %vector.body - -vector.body: - %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] - %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] - %4 = and %active.lane.mask, %0 - %5 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.masked.load = tail call @llvm.masked.load.nxv4i32.p0(ptr %5, i32 4, %4, poison) - %6 = getelementptr inbounds i32, ptr %b, i64 %index - %wide.masked.load23 = tail call @llvm.masked.load.nxv4i32.p0(ptr %6, i32 4, %4, poison) - %7 = add %wide.masked.load23, %wide.masked.load - %8 = getelementptr inbounds i32, ptr %c, i64 %index - tail call void @llvm.masked.store.nxv4i32.p0( %7, ptr %8, i32 4, %4) - %index.next = add i64 %index, %3 - %active.lane.mask.next = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count) - %9 = extractelement %active.lane.mask.next, i64 0 - br i1 %9, label %vector.body, label %for.cond.cleanup - -for.cond.cleanup: - ret void -} - -define dso_local void @whilewr_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_multiple_64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB7_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: whilewr p0.d, x0, x2 -; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: whilewr p1.d, x1, x2 -; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b -; CHECK-NEXT: whilelo p1.d, xzr, x9 -; CHECK-NEXT: cntp x10, p0, p0.d -; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB7_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3] -; CHECK-NEXT: add x8, x8, x10 -; CHECK-NEXT: whilelo p1.d, x8, x9 -; CHECK-NEXT: b.mi .LBB7_2 -; CHECK-NEXT: .LBB7_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: whilewr_multiple_64: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB7_3 -; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NOSVE2-NEXT: sub x9, x0, x2 -; 
CHECK-NOSVE2-NEXT: mov x8, xzr -; CHECK-NOSVE2-NEXT: add x10, x9, #7 -; CHECK-NOSVE2-NEXT: cmp x9, #0 -; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt -; CHECK-NOSVE2-NEXT: cmn x9, #7 -; CHECK-NOSVE2-NEXT: asr x9, x10, #3 -; CHECK-NOSVE2-NEXT: cset w10, lt -; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9 -; CHECK-NOSVE2-NEXT: sub x9, x1, x2 -; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10 -; CHECK-NOSVE2-NEXT: add x10, x9, #7 -; CHECK-NOSVE2-NEXT: cmp x9, #0 -; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt -; CHECK-NOSVE2-NEXT: cmn x9, #7 -; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b -; CHECK-NOSVE2-NEXT: cset w9, lt -; CHECK-NOSVE2-NEXT: asr x10, x10, #3 -; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p3.d, xzr, x10 -; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x9 -; CHECK-NOSVE2-NEXT: mov w9, w3 -; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b -; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b -; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9 -; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.d -; CHECK-NOSVE2-NEXT: and x10, x10, #0xff -; CHECK-NOSVE2-NEXT: .LBB7_2: // %vector.body -; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] -; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] -; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d -; CHECK-NOSVE2-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3] -; CHECK-NOSVE2-NEXT: add x8, x8, x10 -; CHECK-NOSVE2-NEXT: whilelo p1.d, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB7_2 -; CHECK-NOSVE2-NEXT: .LBB7_3: // %for.cond.cleanup -; CHECK-NOSVE2-NEXT: ret -entry: - %cmp9 = icmp sgt i32 %n, 0 - br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup - -for.body.preheader: - %c12 = ptrtoint ptr %c to i64 - %a13 = ptrtoint ptr %a to i64 - %b14 = ptrtoint ptr %b to i64 - %wide.trip.count = zext nneg i32 %n to i64 - %sub.diff = sub i64 %a13, %c12 - %diff = sdiv i64 %sub.diff, 8 - %neg.compare = icmp slt i64 %sub.diff, -7 - %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 - %.splat = shufflevector %.splatinsert, poison, zeroinitializer - %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) - %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat - %sub.diff16 = sub i64 %b14, %c12 - %diff17 = sdiv i64 %sub.diff16, 8 - %neg.compare18 = icmp slt i64 %sub.diff16, -7 - %.splatinsert19 = insertelement poison, i1 %neg.compare18, i64 0 - %.splat20 = shufflevector %.splatinsert19, poison, zeroinitializer - %ptr.diff.lane.mask21 = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff17) - %active.lane.mask.alias22 = or %ptr.diff.lane.mask21, %.splat20 - %0 = and %active.lane.mask.alias, %active.lane.mask.alias22 - %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count) - %1 = zext %0 to - %2 = tail call i8 @llvm.vector.reduce.add.nxv2i8( %1) - %3 = zext i8 %2 to i64 - br label %vector.body - -vector.body: - %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] - %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] - %4 = and %active.lane.mask, %0 - %5 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.masked.load = tail call @llvm.masked.load.nxv2i64.p0(ptr %5, i32 8, %4, poison) - %6 = getelementptr inbounds i64, ptr %b, i64 %index - %wide.masked.load23 = tail call @llvm.masked.load.nxv2i64.p0(ptr %6, i32 8, %4, poison) - %7 = add 
<vscale x 2 x i64> %wide.masked.load23, %wide.masked.load
-  %8 = getelementptr inbounds i64, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %7, ptr %8, i32 8, <vscale x 2 x i1> %4)
-  %index.next = add i64 %index, %3
-  %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %9 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
-  br i1 %9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-declare i64 @llvm.vscale.i64() #1
-
-declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64) #1
-
-declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>) #2
-
-declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>) #3
-
-declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64) #1
-
-declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nocapture, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i16>) #2
-
-declare void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16>, ptr nocapture, i32 immarg, <vscale x 8 x i1>) #3
-
-declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64) #1
-
-declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>) #2
-
-declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, i32 immarg, <vscale x 4 x i1>) #3
-
-declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64) #1
-
-declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i64>) #2
-
-declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr nocapture, i32 immarg, <vscale x 2 x i1>) #3
-
-attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+outline-atomics,+pauth,+ras,+rcpc,+rdm,+sme,+sme2,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
-attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
-attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
-attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }

From 6c700e7cac641f17ed853526900a0416d7169ff8 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs
Date: Wed, 31 Jul 2024 14:27:32 +0100
Subject: [PATCH 13/19] Check the divide's first operand

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 33 +++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)
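
Note (editorial, illustrative only; not part of the original patch): the
checks added below rely on two integer identities for a truncating signed
division by a power-of-two element size. A minimal standalone C++ sanity
check of both forms, assuming nothing beyond the standard library (the
function names here are invented for illustration):

  #include <cassert>
  #include <cstdint>

  // What the IR's sdiv computes: C++ integer division truncates toward zero.
  int64_t truncDiv(int64_t Diff, int64_t EltSize) { return Diff / EltSize; }

  // The SELECT_CC + ADD + SRA shape matched for i32/i64 elements:
  //   pos_diff = diff < 0 ? diff + (eltsize - 1) : diff, then >> log2(eltsize)
  int64_t selectForm(int64_t Diff, unsigned Log2) {
    int64_t EltSize = int64_t(1) << Log2;
    int64_t Pos = Diff < 0 ? Diff + (EltSize - 1) : Diff;
    return Pos >> Log2; // arithmetic right shift, as on all mainstream targets
  }

  // The ADD + SRL-by-63 shape matched for i16 elements: add the sign bit to
  // the difference, then shift right arithmetically by one.
  int64_t signBitForm(int64_t Diff) {
    return (Diff + int64_t(uint64_t(Diff) >> 63)) >> 1;
  }

  int main() {
    for (int64_t Diff = -1024; Diff <= 1024; ++Diff) {
      assert(truncDiv(Diff, 2) == signBitForm(Diff));   // i16
      assert(truncDiv(Diff, 4) == selectForm(Diff, 2)); // i32
      assert(truncDiv(Diff, 8) == selectForm(Diff, 3)); // i64
    }
  }

Both forms agree with the truncating divide for every difference, which is
why the matcher can accept either DAG shape as the "positive difference"
operand of the division.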
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7c35f46fb08e4..8814da78762ef 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13826,13 +13826,42 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
       (EltSize != 1 && LaneMask.getOperand(2).getOpcode() != ISD::SRA))
     return SDValue();
 
-  // An alias mask for i8 elements omits the division because it would just
-  // divide by 1
+  // The number of elements that alias is calculated by dividing the positive difference between the pointers by the element size.
+  // An alias mask for i8 elements omits the division because it would just divide by 1
   if (EltSize > 1) {
     SDValue DiffDiv = LaneMask.getOperand(2);
     auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
     if (!DiffDivConst || DiffDivConst->getZExtValue() != Log2_64(EltSize))
       return SDValue();
+    if (EltSize > 2) {
+      // When masking i32 or i64 elements, the positive value of the possibly-negative difference comes from a select of the difference if it's positive, otherwise the difference plus the element size if it's negative:
+      // pos_diff = diff < 0 ? (diff + 7) : diff
+      SDValue Select = DiffDiv.getOperand(0);
+      // Make sure the difference is being compared by the select
+      if (Select.getOpcode() != ISD::SELECT_CC || Select.getOperand(3) != Diff)
+        return SDValue();
+      // Make sure it's checking if the difference is less than 0
+      if (auto *SelectConst = dyn_cast<ConstantSDNode>(Select.getOperand(1)); !SelectConst || SelectConst->getZExtValue() != 0 || cast<CondCodeSDNode>(Select.getOperand(4))->get() != ISD::CondCode::SETLT)
+        return SDValue();
+      // An add creates a positive value from the negative difference
+      SDValue Add = Select.getOperand(2);
+      if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
+        return SDValue();
+      if (auto *AddConst = dyn_cast<ConstantSDNode>(Add.getOperand(1)); !AddConst || AddConst->getZExtValue() != EltSize - 1)
+        return SDValue();
+    } else {
+      // When masking i16 elements, this positive value comes from adding the difference's sign bit to the difference itself. This is equivalent to the 32 bit and 64 bit case:
+      // pos_diff = diff + sign_bit (diff)
+      SDValue Add = DiffDiv.getOperand(0);
+      if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
+        return SDValue();
+      // A logical right shift by 63 extracts the sign bit from the difference value
+      SDValue Shift = Add.getOperand(1);
+      if (Shift.getOpcode() != ISD::SRL || Shift.getOperand(0) != Diff)
+        return SDValue();
+      if (auto *ShiftConst = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); !ShiftConst || ShiftConst->getZExtValue() != 63)
+        return SDValue();
+    }
   } else if (LaneMask.getOperand(2) != Diff)
     return SDValue();
 
From 207c3bc79406481b603ffb421097456a42875df5 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs
Date: Wed, 31 Jul 2024 14:38:57 +0100
Subject: [PATCH 14/19] Format

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 29 ++++++++++++-------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8814da78762ef..588bd3e54882a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13826,40 +13826,49 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
       (EltSize != 1 && LaneMask.getOperand(2).getOpcode() != ISD::SRA))
     return SDValue();
 
-  // The number of elements that alias is calculated by dividing the positive difference between the pointers by the element size.
-  // An alias mask for i8 elements omits the division because it would just divide by 1
+  // The number of elements that alias is calculated by dividing the positive
+  // difference between the pointers by the element size. An alias mask for i8
+  // elements omits the division because it would just divide by 1
   if (EltSize > 1) {
     SDValue DiffDiv = LaneMask.getOperand(2);
     auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
     if (!DiffDivConst || DiffDivConst->getZExtValue() != Log2_64(EltSize))
       return SDValue();
     if (EltSize > 2) {
-      // When masking i32 or i64 elements, the positive value of the possibly-negative difference comes from a select of the difference if it's positive, otherwise the difference plus the element size if it's negative:
-      // pos_diff = diff < 0 ? (diff + 7) : diff
+      // When masking i32 or i64 elements, the positive value of the
+      // possibly-negative difference comes from a select of the difference if
+      // it's positive, otherwise the difference plus the element size if it's
+      // negative: pos_diff = diff < 0 ? (diff + 7) : diff
       SDValue Select = DiffDiv.getOperand(0);
       // Make sure the difference is being compared by the select
       if (Select.getOpcode() != ISD::SELECT_CC || Select.getOperand(3) != Diff)
         return SDValue();
       // Make sure it's checking if the difference is less than 0
-      if (auto *SelectConst = dyn_cast<ConstantSDNode>(Select.getOperand(1)); !SelectConst || SelectConst->getZExtValue() != 0 || cast<CondCodeSDNode>(Select.getOperand(4))->get() != ISD::CondCode::SETLT)
+      if (auto *SelectConst = dyn_cast<ConstantSDNode>(Select.getOperand(1));
+          !SelectConst || SelectConst->getZExtValue() != 0 ||
+          cast<CondCodeSDNode>(Select.getOperand(4))->get() !=
+              ISD::CondCode::SETLT)
         return SDValue();
       // An add creates a positive value from the negative difference
       SDValue Add = Select.getOperand(2);
       if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
         return SDValue();
-      if (auto *AddConst = dyn_cast<ConstantSDNode>(Add.getOperand(1)); !AddConst || AddConst->getZExtValue() != EltSize - 1)
+      if (auto *AddConst = dyn_cast<ConstantSDNode>(Add.getOperand(1));
+          !AddConst || AddConst->getZExtValue() != EltSize - 1)
         return SDValue();
     } else {
-      // When masking i16 elements, this positive value comes from adding the difference's sign bit to the difference itself. This is equivalent to the 32 bit and 64 bit case:
-      // pos_diff = diff + sign_bit (diff)
+      // When masking i16 elements, this positive value comes from adding the
+      // difference's sign bit to the difference itself. This is equivalent to
+      // the 32 bit and 64 bit case: pos_diff = diff + sign_bit (diff)
       SDValue Add = DiffDiv.getOperand(0);
       if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
         return SDValue();
-      // A logical right shift by 63 extracts the sign bit from the difference value
+      // A logical right shift by 63 extracts the sign bit from the difference
       SDValue Shift = Add.getOperand(1);
       if (Shift.getOpcode() != ISD::SRL || Shift.getOperand(0) != Diff)
         return SDValue();
-      if (auto *ShiftConst = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); !ShiftConst || ShiftConst->getZExtValue() != 63)
+      if (auto *ShiftConst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+          !ShiftConst || ShiftConst->getZExtValue() != 63)
         return SDValue();
     }
   } else if (LaneMask.getOperand(2) != Diff)

From 82da88920b44ba31aa716025e7c75c0acc0aed20 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs
Date: Thu, 1 Aug 2024 16:16:56 +0100
Subject: [PATCH 15/19] Pass subtarget from caller

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
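
Note (editorial, illustrative only; not part of the original patch): the doc
comment touched by the first hunk below summarises the IR shape being
matched. As a reference for what that mask computes per lane, a small C++
model of the tests' IR, where Diff is the byte difference between the two
pointers (%sub.diff in the tests) and the names are invented for
illustration:

  #include <cassert>
  #include <cstdint>

  // Lane I is kept on when the whole vector is known hazard-free (the
  // splatted setcc_lt) or when I stays below the element distance between
  // the pointers (the get_active_lane_mask of the divided difference).
  bool laneEnabled(int64_t Diff, int64_t EltSize, uint64_t I) {
    bool WholeVectorSafe = Diff < -(EltSize - 1);  // splat (setcc_lt ...)
    uint64_t SafeLanes = uint64_t(Diff / EltSize); // div (sub ...), eltsize
    return WholeVectorSafe || I < SafeLanes;       // or (...), lane mask
  }

  int main() {
    assert(laneEnabled(6, 4, 0) && !laneEnabled(6, 4, 1)); // i32, 6B apart
    assert(laneEnabled(-9, 4, 3));  // negative by a full element: all safe
    assert(!laneEnabled(-2, 4, 0)); // partial overlap: no lane safe
  }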
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 588bd3e54882a..db5562a2c4265 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13788,8 +13788,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
 /// one loop iteration. This tries to match:
 /// or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
 ///   (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
-SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
-  if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE2())
+SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget &Subtarget) {
+  if (!Subtarget.hasSVE2())
     return SDValue();
   SDValue LaneMask = Op.getOperand(0);
   SDValue Splat = Op.getOperand(1);
@@ -13903,7 +13903,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG) {
 
 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                              SelectionDAG &DAG) const {
-  if (SDValue SV = tryWhileWRFromOR(Op, DAG))
+  if (SDValue SV = tryWhileWRFromOR(Op, DAG, DAG.getSubtarget<AArch64Subtarget>()))
     return SV;
   if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                    !Subtarget->isNeonAvailable()))

From 4959ebadf85f62e569fdb597225550de7f44b723 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs
Date: Thu, 1 Aug 2024 16:17:09 +0100
Subject: [PATCH 16/19] Remove dso_local from loop tests

---
 llvm/test/CodeGen/AArch64/whilewr.ll | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index 67959112705a1..18966a647890b 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -207,7 +207,7 @@ entry:
   ret <vscale x 16 x i1> %active.lane.mask.alias
 }
 
-define dso_local void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-LABEL: whilewr_loop_8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cmp w3, #1
@@ -301,7 +301,7 @@ for.cond.cleanup:
   ret void
 }
 
-define dso_local void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-LABEL: whilewr_loop_16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cmp w3, #1
@@ -394,7 +394,7 @@ for.cond.cleanup:
   ret void
 }
 
-define dso_local void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-LABEL: whilewr_loop_32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cmp w3, #1
@@ -489,7 +489,7 @@ for.cond.cleanup:
   ret void
 }
 
-define dso_local void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
+define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-LABEL: whilewr_loop_64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cmp w3, #1
@@ -584,7 +584,7 @@ for.cond.cleanup:
   ret void
 }
 
-define dso_local void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
+define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-LABEL: whilewr_loop_multiple_8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cmp w3, #1
@@ -696,7 +696,7 @@ for.cond.cleanup:
   ret void
 }
 
-define dso_local void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
+define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-LABEL: whilewr_loop_multiple_16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cmp w3, #1
@@ -814,7 +814,7 @@ for.cond.cleanup:
   ret void
 }
 
-define dso_local void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
+define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-LABEL: whilewr_loop_multiple_32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cmp w3, #1
@@ -936,7 +936,7 @@ for.cond.cleanup:
   ret void
 }
 
-define dso_local void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
+define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-LABEL: whilewr_loop_multiple_64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cmp w3, #1
From 2072294d63594e7b6610a454bcd289b24373bb6c Mon Sep 17 00:00:00 2001
From: Samuel Tebbs
Date: Thu, 1 Aug 2024 16:20:04 +0100
Subject: [PATCH 17/19] Use isNullConstant

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)
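
Note (editorial, illustrative only; not part of the original patch):
isNullConstant is an existing SelectionDAG helper (declared in
llvm/include/llvm/CodeGen/SelectionDAGNodes.h). A rough sketch of the check
it replaces in this patch, not the library's exact implementation:

  // Succeeds only when V is a ConstantSDNode with value zero; requires the
  // usual LLVM CodeGen headers to compile.
  static bool isNullConstantSketch(SDValue V) {
    auto *C = dyn_cast<ConstantSDNode>(V);
    return C && C->isZero();
  }

Using the helper folds the dyn_cast-plus-getZExtValue() sequence into one
predicate and removes the named temporaries.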
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index db5562a2c4265..361daefabf09b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13821,8 +13821,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget &
   if (Diff.getOpcode() != ISD::SUB || Diff.getValueType() != MVT::i64)
     return SDValue();
 
-  auto LaneMaskConst = dyn_cast<ConstantSDNode>(LaneMask.getOperand(1));
-  if (!LaneMaskConst || LaneMaskConst->getZExtValue() != 0 ||
+  if (!isNullConstant(LaneMask.getOperand(1)) ||
       (EltSize != 1 && LaneMask.getOperand(2).getOpcode() != ISD::SRA))
     return SDValue();
 
@@ -13844,8 +13843,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget &
       if (Select.getOpcode() != ISD::SELECT_CC || Select.getOperand(3) != Diff)
         return SDValue();
       // Make sure it's checking if the difference is less than 0
-      if (auto *SelectConst = dyn_cast<ConstantSDNode>(Select.getOperand(1));
-          !SelectConst || SelectConst->getZExtValue() != 0 ||
+      if (!isNullConstant(Select.getOperand(1)) ||
           cast<CondCodeSDNode>(Select.getOperand(4))->get() !=
               ISD::CondCode::SETLT)
         return SDValue();

From 46602d8c8cfcbda2e29af19fd949f4e462d9744d Mon Sep 17 00:00:00 2001
From: Samuel Tebbs
Date: Thu, 1 Aug 2024 16:20:13 +0100
Subject: [PATCH 18/19] Fix new lines

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
 llvm/test/CodeGen/AArch64/whilewr.ll            | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 361daefabf09b..04ebfe51bf8a8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13900,9 +13900,9 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget &
 
 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                              SelectionDAG &DAG) const {
-
   if (SDValue SV = tryWhileWRFromOR(Op, DAG, DAG.getSubtarget<AArch64Subtarget>()))
     return SV;
+
   if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                    !Subtarget->isNeonAvailable()))
     return LowerToScalableOp(Op, DAG);
diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
index 18966a647890b..9f1ea85079238 100644
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ b/llvm/test/CodeGen/AArch64/whilewr.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s
 ; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
+
 define <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
 ; CHECK-LABEL: whilewr_8:
 ; CHECK:       // %bb.0: // %entry

From c30aa1041501790bcc9b62cda6f1923d1383a64a Mon Sep 17 00:00:00 2001
From: Samuel Tebbs
Date: Thu, 1 Aug 2024 17:36:11 +0100
Subject: [PATCH 19/19] format

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 04ebfe51bf8a8..a22bd5e15e5c3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13788,7 +13788,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
 /// one loop iteration. This tries to match:
 /// or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
 ///   (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
-SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget &Subtarget) {
+SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
+                         const AArch64Subtarget &Subtarget) {
   if (!Subtarget.hasSVE2())
     return SDValue();
   SDValue LaneMask = Op.getOperand(0);
@@ -13900,7 +13901,8 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
 
 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                              SelectionDAG &DAG) const {
-  if (SDValue SV = tryWhileWRFromOR(Op, DAG, DAG.getSubtarget<AArch64Subtarget>()))
+  if (SDValue SV =
+          tryWhileWRFromOR(Op, DAG, DAG.getSubtarget<AArch64Subtarget>()))
     return SV;
   if (useSVEForFixedLengthVectorVT(Op.getValueType(),