diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a97b0d3b1db92..0f3577468a8b4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5168,26 +5168,45 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( return false; } case Instruction::Mul: { + auto ShouldSinkSplatForIndexedVariant = [](Value *V) { + auto *Ty = cast<VectorType>(V->getType()); + // For SVE the lane-indexing is within 128-bits, so we can't fold splats. + if (Ty->isScalableTy()) + return false; + + // Indexed variants of Mul exist for i16 and i32 element types only. + return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32; + }; + int NumZExts = 0, NumSExts = 0; for (auto &Op : I->operands()) { // Make sure we are not already sinking this operand if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) continue; - if (match(&Op, m_SExt(m_Value()))) { - NumSExts++; - continue; - } else if (match(&Op, m_ZExt(m_Value()))) { - NumZExts++; + if (match(&Op, m_ZExtOrSExt(m_Value()))) { + auto *Ext = cast<Instruction>(Op); + auto *ExtOp = Ext->getOperand(0); + if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp)) + Ops.push_back(&Ext->getOperandUse(0)); + Ops.push_back(&Op); + + if (isa<SExtInst>(Ext)) + NumSExts++; + else + NumZExts++; + continue; } ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op); + if (!Shuffle) + continue; // If the Shuffle is a splat and the operand is a zext/sext, sinking the // operand and the s/zext can help create indexed s/umull. This is // especially useful to prevent i64 mul being scalarized. 
- if (Shuffle && isSplatShuffle(Shuffle) + if (isSplatShuffle(Shuffle) && match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) { Ops.push_back(&Shuffle->getOperandUse(0)); Ops.push_back(&Op); @@ -5198,9 +5217,6 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( continue; } - if (!Shuffle) - continue; - Value *ShuffleOperand = Shuffle->getOperand(0); InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand); if (!Insert) @@ -5232,12 +5248,26 @@ bool AArch64TTIImpl::isProfitableToSinkOperands( NumZExts++; } + Ops.push_back(&Insert->getOperandUse(1)); Ops.push_back(&Shuffle->getOperandUse(0)); Ops.push_back(&Op); } - // Is it profitable to sink if we found two of the same type of extends. - return !Ops.empty() && (NumSExts == 2 || NumZExts == 2); + // It is profitable to sink if we found two of the same type of extends. + if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2)) + return true; + + // Otherwise, see if we should sink splats for indexed variants. + if (!ShouldSinkSplatForIndexedVariant(I)) + return false; + + Ops.clear(); + if (isSplatShuffle(I->getOperand(0))) + Ops.push_back(&I->getOperandUse(0)); + if (isSplatShuffle(I->getOperand(1))) + Ops.push_back(&I->getOperandUse(1)); + + return !Ops.empty(); } default: return false; diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll index ef54cc4bbf718..482135b721da4 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-crash.ll @@ -10,14 +10,18 @@ target triple = "aarch64-unknown-linux-gnu" define dso_local i32 @dupext_crashtest(i32 %e) local_unnamed_addr { ; CHECK-LABEL: dupext_crashtest: ; CHECK: // %bb.0: // %for.body.lr.ph -; CHECK-NEXT: mov w8, w0 -; CHECK-NEXT: dup v0.2s, w8 ; CHECK-NEXT: .LBB0_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr d1, [x8] -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: str 
d1, [x8] +; CHECK-NEXT: ldr d0, [x8] +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: mul w9, w0, w9 +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: str d0, [x8] ; CHECK-NEXT: b .LBB0_1 for.body.lr.ph: %conv314 = zext i32 %e to i64 diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index 0c7a61739695f..fb6575cc0ee83 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -5,9 +5,8 @@ define void @matrix_mul_unsigned(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) { ; CHECK-SD-LABEL: matrix_mul_unsigned: ; CHECK-SD: // %bb.0: // %vector.header -; CHECK-SD-NEXT: and w8, w3, #0xffff +; CHECK-SD-NEXT: dup v0.4h, w3 ; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-SD-NEXT: dup v0.4h, w8 ; CHECK-SD-NEXT: and x8, x0, #0xfffffff8 ; CHECK-SD-NEXT: .LBB0_1: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 @@ -91,9 +90,8 @@ for.end12: ; preds = %vector.body define void @matrix_mul_signed(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) { ; CHECK-SD-LABEL: matrix_mul_signed: ; CHECK-SD: // %bb.0: // %vector.header -; CHECK-SD-NEXT: sxth w8, w3 +; CHECK-SD-NEXT: dup v0.4h, w3 ; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-SD-NEXT: dup v0.4h, w8 ; CHECK-SD-NEXT: and x8, x0, #0xfffffff8 ; CHECK-SD-NEXT: .LBB1_1: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 @@ -179,9 +177,8 @@ for.end12: ; preds = %vector.body define void @matrix_mul_double_shuffle(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) { ; CHECK-SD-LABEL: matrix_mul_double_shuffle: ; CHECK-SD: // %bb.0: // %vector.header -; CHECK-SD-NEXT: and w8, w3, #0xffff +; CHECK-SD-NEXT: dup v0.4h, w3 ; CHECK-SD-NEXT: 
// kill: def $w0 killed $w0 def $x0 -; CHECK-SD-NEXT: dup v0.4h, w8 ; CHECK-SD-NEXT: and x8, x0, #0xfffffff8 ; CHECK-SD-NEXT: // kill: def $w0 killed $w0 killed $x0 def $x0 ; CHECK-SD-NEXT: .LBB2_1: // %vector.body @@ -261,44 +258,44 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-SD-NEXT: cmp w3, #1 ; CHECK-SD-NEXT: b.lt .LBB3_8 ; CHECK-SD-NEXT: // %bb.1: // %for.body.preheader -; CHECK-SD-NEXT: sxth w8, w1 ; CHECK-SD-NEXT: cmp w3, #15 -; CHECK-SD-NEXT: mov w9, w3 +; CHECK-SD-NEXT: mov w8, w3 ; CHECK-SD-NEXT: b.hi .LBB3_3 ; CHECK-SD-NEXT: // %bb.2: -; CHECK-SD-NEXT: mov x10, xzr +; CHECK-SD-NEXT: mov x9, xzr ; CHECK-SD-NEXT: b .LBB3_6 ; CHECK-SD-NEXT: .LBB3_3: // %vector.ph -; CHECK-SD-NEXT: dup v0.8h, w8 -; CHECK-SD-NEXT: and x10, x9, #0xfffffff0 -; CHECK-SD-NEXT: add x11, x2, #32 -; CHECK-SD-NEXT: add x12, x0, #16 -; CHECK-SD-NEXT: mov x13, x10 +; CHECK-SD-NEXT: dup v0.8h, w1 +; CHECK-SD-NEXT: and x9, x8, #0xfffffff0 +; CHECK-SD-NEXT: add x10, x2, #32 +; CHECK-SD-NEXT: add x11, x0, #16 +; CHECK-SD-NEXT: mov x12, x9 ; CHECK-SD-NEXT: .LBB3_4: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-SD-NEXT: ldp q1, q2, [x12, #-16] -; CHECK-SD-NEXT: subs x13, x13, #16 -; CHECK-SD-NEXT: add x12, x12, #32 +; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16] +; CHECK-SD-NEXT: subs x12, x12, #16 +; CHECK-SD-NEXT: add x11, x11, #32 ; CHECK-SD-NEXT: smull2 v3.4s, v0.8h, v1.8h ; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: smull2 v4.4s, v0.8h, v2.8h ; CHECK-SD-NEXT: smull v2.4s, v0.4h, v2.4h -; CHECK-SD-NEXT: stp q1, q3, [x11, #-32] -; CHECK-SD-NEXT: stp q2, q4, [x11], #64 +; CHECK-SD-NEXT: stp q1, q3, [x10, #-32] +; CHECK-SD-NEXT: stp q2, q4, [x10], #64 ; CHECK-SD-NEXT: b.ne .LBB3_4 ; CHECK-SD-NEXT: // %bb.5: // %middle.block -; CHECK-SD-NEXT: cmp x10, x9 +; CHECK-SD-NEXT: cmp x9, x8 ; CHECK-SD-NEXT: b.eq .LBB3_8 ; CHECK-SD-NEXT: .LBB3_6: // %for.body.preheader1 -; CHECK-SD-NEXT: add x11, x2, x10, lsl #2 
-; CHECK-SD-NEXT: add x12, x0, x10, lsl #1 -; CHECK-SD-NEXT: sub x9, x9, x10 +; CHECK-SD-NEXT: sxth w10, w1 +; CHECK-SD-NEXT: add x11, x2, x9, lsl #2 +; CHECK-SD-NEXT: add x12, x0, x9, lsl #1 +; CHECK-SD-NEXT: sub x8, x8, x9 ; CHECK-SD-NEXT: .LBB3_7: // %for.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-SD-NEXT: ldrsh w10, [x12], #2 -; CHECK-SD-NEXT: subs x9, x9, #1 -; CHECK-SD-NEXT: mul w10, w10, w8 -; CHECK-SD-NEXT: str w10, [x11], #4 +; CHECK-SD-NEXT: ldrsh w9, [x12], #2 +; CHECK-SD-NEXT: subs x8, x8, #1 +; CHECK-SD-NEXT: mul w9, w9, w10 +; CHECK-SD-NEXT: str w9, [x11], #4 ; CHECK-SD-NEXT: b.ne .LBB3_7 ; CHECK-SD-NEXT: .LBB3_8: // %for.cond.cleanup ; CHECK-SD-NEXT: ret @@ -424,43 +421,43 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-SD-NEXT: b.lt .LBB4_8 ; CHECK-SD-NEXT: // %bb.1: // %for.body.preheader ; CHECK-SD-NEXT: cmp w3, #15 -; CHECK-SD-NEXT: and w8, w1, #0xffff -; CHECK-SD-NEXT: mov w9, w3 +; CHECK-SD-NEXT: mov w8, w3 ; CHECK-SD-NEXT: b.hi .LBB4_3 ; CHECK-SD-NEXT: // %bb.2: -; CHECK-SD-NEXT: mov x10, xzr +; CHECK-SD-NEXT: mov x9, xzr ; CHECK-SD-NEXT: b .LBB4_6 ; CHECK-SD-NEXT: .LBB4_3: // %vector.ph -; CHECK-SD-NEXT: dup v0.8h, w8 -; CHECK-SD-NEXT: and x10, x9, #0xfffffff0 -; CHECK-SD-NEXT: add x11, x2, #32 -; CHECK-SD-NEXT: add x12, x0, #16 -; CHECK-SD-NEXT: mov x13, x10 +; CHECK-SD-NEXT: dup v0.8h, w1 +; CHECK-SD-NEXT: and x9, x8, #0xfffffff0 +; CHECK-SD-NEXT: add x10, x2, #32 +; CHECK-SD-NEXT: add x11, x0, #16 +; CHECK-SD-NEXT: mov x12, x9 ; CHECK-SD-NEXT: .LBB4_4: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-SD-NEXT: ldp q1, q2, [x12, #-16] -; CHECK-SD-NEXT: subs x13, x13, #16 -; CHECK-SD-NEXT: add x12, x12, #32 +; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16] +; CHECK-SD-NEXT: subs x12, x12, #16 +; CHECK-SD-NEXT: add x11, x11, #32 ; CHECK-SD-NEXT: umull2 v3.4s, v0.8h, v1.8h ; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umull2 v4.4s, v0.8h, 
v2.8h ; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h -; CHECK-SD-NEXT: stp q1, q3, [x11, #-32] -; CHECK-SD-NEXT: stp q2, q4, [x11], #64 +; CHECK-SD-NEXT: stp q1, q3, [x10, #-32] +; CHECK-SD-NEXT: stp q2, q4, [x10], #64 ; CHECK-SD-NEXT: b.ne .LBB4_4 ; CHECK-SD-NEXT: // %bb.5: // %middle.block -; CHECK-SD-NEXT: cmp x10, x9 +; CHECK-SD-NEXT: cmp x9, x8 ; CHECK-SD-NEXT: b.eq .LBB4_8 ; CHECK-SD-NEXT: .LBB4_6: // %for.body.preheader1 -; CHECK-SD-NEXT: add x11, x2, x10, lsl #2 -; CHECK-SD-NEXT: add x12, x0, x10, lsl #1 -; CHECK-SD-NEXT: sub x9, x9, x10 +; CHECK-SD-NEXT: add x10, x2, x9, lsl #2 +; CHECK-SD-NEXT: add x11, x0, x9, lsl #1 +; CHECK-SD-NEXT: and w12, w1, #0xffff +; CHECK-SD-NEXT: sub x8, x8, x9 ; CHECK-SD-NEXT: .LBB4_7: // %for.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-SD-NEXT: ldrh w10, [x12], #2 -; CHECK-SD-NEXT: subs x9, x9, #1 -; CHECK-SD-NEXT: mul w10, w10, w8 -; CHECK-SD-NEXT: str w10, [x11], #4 +; CHECK-SD-NEXT: ldrh w9, [x11], #2 +; CHECK-SD-NEXT: subs x8, x8, #1 +; CHECK-SD-NEXT: mul w9, w9, w12 +; CHECK-SD-NEXT: str w9, [x10], #4 ; CHECK-SD-NEXT: b.ne .LBB4_7 ; CHECK-SD-NEXT: .LBB4_8: // %for.cond.cleanup ; CHECK-SD-NEXT: ret @@ -470,47 +467,48 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-GI-NEXT: cmp w3, #0 ; CHECK-GI-NEXT: b.le .LBB4_7 ; CHECK-GI-NEXT: // %bb.1: // %for.body.preheader -; CHECK-GI-NEXT: mov x9, xzr +; CHECK-GI-NEXT: mov x8, xzr ; CHECK-GI-NEXT: cmp w3, #16 -; CHECK-GI-NEXT: and w8, w1, #0xffff -; CHECK-GI-NEXT: mov w10, w3 +; CHECK-GI-NEXT: mov w9, w3 ; CHECK-GI-NEXT: b.lo .LBB4_5 ; CHECK-GI-NEXT: // %bb.2: // %vector.ph -; CHECK-GI-NEXT: dup v0.4s, w8 -; CHECK-GI-NEXT: and x9, x10, #0xfffffff0 -; CHECK-GI-NEXT: add x11, x2, #32 -; CHECK-GI-NEXT: add x12, x0, #16 -; CHECK-GI-NEXT: mov x13, x9 +; CHECK-GI-NEXT: and x8, x9, #0xfffffff0 +; CHECK-GI-NEXT: add x10, x2, #32 +; CHECK-GI-NEXT: add x11, x0, #16 +; CHECK-GI-NEXT: mov x12, x8 ; CHECK-GI-NEXT: .LBB4_3: // 
%vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-GI-NEXT: ldp q1, q2, [x12, #-16] -; CHECK-GI-NEXT: mov x14, x11 -; CHECK-GI-NEXT: subs x13, x13, #16 -; CHECK-GI-NEXT: add x12, x12, #32 -; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 +; CHECK-GI-NEXT: ldp q0, q1, [x11, #-16] +; CHECK-GI-NEXT: and w13, w1, #0xffff +; CHECK-GI-NEXT: dup v2.4s, w13 +; CHECK-GI-NEXT: mov x13, x10 +; CHECK-GI-NEXT: subs x12, x12, #16 +; CHECK-GI-NEXT: add x11, x11, #32 +; CHECK-GI-NEXT: ushll v3.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll v4.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-GI-NEXT: mul v3.4s, v0.4s, v3.4s -; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: mul v4.4s, v0.4s, v4.4s -; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s -; CHECK-GI-NEXT: stp q3, q1, [x14, #-32]! -; CHECK-GI-NEXT: stp q4, q2, [x11], #64 +; CHECK-GI-NEXT: mul v3.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mul v0.4s, v2.4s, v0.4s +; CHECK-GI-NEXT: mul v4.4s, v2.4s, v4.4s +; CHECK-GI-NEXT: mul v1.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: stp q3, q0, [x13, #-32]! 
+; CHECK-GI-NEXT: stp q4, q1, [x10], #64 ; CHECK-GI-NEXT: b.ne .LBB4_3 ; CHECK-GI-NEXT: // %bb.4: // %middle.block -; CHECK-GI-NEXT: cmp x9, x10 +; CHECK-GI-NEXT: cmp x8, x9 ; CHECK-GI-NEXT: b.eq .LBB4_7 ; CHECK-GI-NEXT: .LBB4_5: // %for.body.preheader1 -; CHECK-GI-NEXT: add x11, x2, x9, lsl #2 -; CHECK-GI-NEXT: add x12, x0, x9, lsl #1 -; CHECK-GI-NEXT: sub x9, x10, x9 +; CHECK-GI-NEXT: add x10, x2, x8, lsl #2 +; CHECK-GI-NEXT: add x11, x0, x8, lsl #1 +; CHECK-GI-NEXT: and w12, w1, #0xffff +; CHECK-GI-NEXT: sub x8, x9, x8 ; CHECK-GI-NEXT: .LBB4_6: // %for.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-GI-NEXT: ldrh w10, [x12], #2 -; CHECK-GI-NEXT: subs x9, x9, #1 -; CHECK-GI-NEXT: mul w10, w10, w8 -; CHECK-GI-NEXT: str w10, [x11], #4 +; CHECK-GI-NEXT: ldrh w9, [x11], #2 +; CHECK-GI-NEXT: subs x8, x8, #1 +; CHECK-GI-NEXT: mul w9, w9, w12 +; CHECK-GI-NEXT: str w9, [x10], #4 ; CHECK-GI-NEXT: b.ne .LBB4_6 ; CHECK-GI-NEXT: .LBB4_7: // %for.cond.cleanup ; CHECK-GI-NEXT: ret @@ -600,7 +598,7 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A, ; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 ; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 ; CHECK-SD-NEXT: and x11, x10, #0xfffffff0 -; CHECK-SD-NEXT: dup v2.8h, w9 +; CHECK-SD-NEXT: fmov s2, w9 ; CHECK-SD-NEXT: add x8, x0, #8 ; CHECK-SD-NEXT: mov x12, x11 ; CHECK-SD-NEXT: .LBB5_5: // %vector.body @@ -610,8 +608,8 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A, ; CHECK-SD-NEXT: add x8, x8, #16 ; CHECK-SD-NEXT: ushll v3.8h, v3.8b, #0 ; CHECK-SD-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-SD-NEXT: mla v0.8h, v2.8h, v3.8h -; CHECK-SD-NEXT: mla v1.8h, v2.8h, v4.8h +; CHECK-SD-NEXT: mla v0.8h, v3.8h, v2.h[0] +; CHECK-SD-NEXT: mla v1.8h, v4.8h, v2.h[0] ; CHECK-SD-NEXT: b.ne .LBB5_5 ; CHECK-SD-NEXT: // %bb.6: // %middle.block ; CHECK-SD-NEXT: add v0.8h, v1.8h, v0.8h @@ -1025,9 +1023,8 @@ exit: define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, 
ptr nocapture readonly %A, i32 %val) { ; CHECK-SD-LABEL: matrix_mul_unsigned_and: ; CHECK-SD: // %bb.0: // %vector.header -; CHECK-SD-NEXT: and w8, w3, #0xffff +; CHECK-SD-NEXT: dup v0.4h, w3 ; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-SD-NEXT: dup v0.4h, w8 ; CHECK-SD-NEXT: and x8, x0, #0xfffffff8 ; CHECK-SD-NEXT: .LBB10_1: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1111,9 +1108,8 @@ for.end12: ; preds = %vector.body define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) { ; CHECK-SD-LABEL: matrix_mul_unsigned_and_double: ; CHECK-SD: // %bb.0: // %vector.header -; CHECK-SD-NEXT: and w8, w3, #0xffff +; CHECK-SD-NEXT: dup v0.8h, w3 ; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-SD-NEXT: dup v0.8h, w8 ; CHECK-SD-NEXT: and x8, x0, #0xfffffff0 ; CHECK-SD-NEXT: .LBB11_1: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1207,10 +1203,10 @@ for.end12: ; preds = %vector.body define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) { ; CHECK-SD-LABEL: matrix_mul_signed_and: ; CHECK-SD: // %bb.0: // %vector.header -; CHECK-SD-NEXT: and w8, w3, #0xffff +; CHECK-SD-NEXT: and w9, w3, #0xffff ; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-SD-NEXT: dup v0.4s, w8 ; CHECK-SD-NEXT: and x8, x0, #0xfffffff8 +; CHECK-SD-NEXT: fmov s0, w9 ; CHECK-SD-NEXT: .LBB12_1: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1 @@ -1220,8 +1216,8 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado ; CHECK-SD-NEXT: add w0, w0, #8 ; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-SD-NEXT: mul v1.4s, v0.4s, v1.4s -; CHECK-SD-NEXT: mul v2.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: mul v1.4s, v1.4s, v0.s[0] +; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0] ; CHECK-SD-NEXT: stp q1, q2, [x9] ; 
CHECK-SD-NEXT: b.ne .LBB12_1 ; CHECK-SD-NEXT: // %bb.2: // %for.end12 @@ -1295,10 +1291,10 @@ for.end12: ; preds = %vector.body define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) { ; CHECK-SD-LABEL: matrix_mul_signed_and_double: ; CHECK-SD: // %bb.0: // %vector.header -; CHECK-SD-NEXT: and w8, w3, #0xffff +; CHECK-SD-NEXT: and w9, w3, #0xffff ; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-SD-NEXT: dup v0.4s, w8 ; CHECK-SD-NEXT: and x8, x0, #0xfffffff0 +; CHECK-SD-NEXT: fmov s0, w9 ; CHECK-SD-NEXT: .LBB13_1: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1 @@ -1311,10 +1307,10 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur ; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: sshll2 v4.4s, v2.8h, #0 ; CHECK-SD-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-SD-NEXT: mul v3.4s, v0.4s, v3.4s -; CHECK-SD-NEXT: mul v1.4s, v0.4s, v1.4s -; CHECK-SD-NEXT: mul v4.4s, v0.4s, v4.4s -; CHECK-SD-NEXT: mul v2.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: mul v3.4s, v3.4s, v0.s[0] +; CHECK-SD-NEXT: mul v1.4s, v1.4s, v0.s[0] +; CHECK-SD-NEXT: mul v4.4s, v4.4s, v0.s[0] +; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0] ; CHECK-SD-NEXT: stp q1, q3, [x9] ; CHECK-SD-NEXT: stp q2, q4, [x9, #32] ; CHECK-SD-NEXT: b.ne .LBB13_1 diff --git a/llvm/test/CodeGen/AArch64/sink-mul-exts.ll b/llvm/test/CodeGen/AArch64/sink-mul-exts.ll new file mode 100644 index 0000000000000..d52ac7847f814 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sink-mul-exts.ll @@ -0,0 +1,232 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s + +define <8 x i16> @mul_splat_sext_v8i16(ptr %x, ptr %y) { +; CHECK-LABEL: mul_splat_sext_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: dup v1.8b, 
v1.b[3] +; CHECK-NEXT: .LBB0_1: // %l1 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr d2, [x1, x8] +; CHECK-NEXT: add x8, x8, #4 +; CHECK-NEXT: cmp w8, #4 +; CHECK-NEXT: smlal v0.8h, v2.8b, v1.8b +; CHECK-NEXT: b.eq .LBB0_1 +; CHECK-NEXT: // %bb.2: // %l2 +; CHECK-NEXT: ret +entry: + %x.val = load <8 x i8>, ptr %x + %x.ext = sext <8 x i8> %x.val to <8 x i16> + %a = shufflevector <8 x i16> %x.ext, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + br label %l1 + +l1: + %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] + %q = phi <8 x i16> [ zeroinitializer, %entry ], [ %c, %l1 ] + %y.idx = mul nuw nsw i32 %p, 4 + %y.ptr = getelementptr i8, ptr %y, i32 %y.idx + %y.val = load <8 x i8>, ptr %y.ptr + %y.ext = sext <8 x i8> %y.val to <8 x i16> + %b = mul <8 x i16> %y.ext, %a + %c = add <8 x i16> %q, %b + %pa = add i32 %p, 1 + %c1 = icmp eq i32 %p, 0 + br i1 %c1, label %l1, label %l2 + +l2: + ret <8 x i16> %c +} + +define <4 x i32> @mul_splat_sext_v4i32(ptr %x, ptr %y) { +; CHECK-LABEL: mul_splat_sext_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: .LBB1_1: // %l1 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr d2, [x1, x8] +; CHECK-NEXT: add x8, x8, #8 +; CHECK-NEXT: cmp w8, #8 +; CHECK-NEXT: smlal v0.4s, v2.4h, v1.h[3] +; CHECK-NEXT: b.eq .LBB1_1 +; CHECK-NEXT: // %bb.2: // %l2 +; CHECK-NEXT: ret +entry: + %x.val = load <4 x i16>, ptr %x + %x.ext = sext <4 x i16> %x.val to <4 x i32> + %a = shufflevector <4 x i32> %x.ext, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + br label %l1 + +l1: + %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] + %q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ] + %y.idx = mul nuw nsw i32 %p, 4 + %y.ptr = getelementptr i16, ptr %y, i32 %y.idx + %y.val = load <4 x i16>, ptr %y.ptr + %y.ext = sext <4 x i16> %y.val to <4 x i32> + %b = mul <4 x i32> %y.ext, %a + %c = add <4 x i32> %q, %b + %pa = add i32 %p, 1 + %c1 = icmp eq i32 %p, 0 + br i1 
%c1, label %l1, label %l2 + +l2: + ret <4 x i32> %c +} + +define <2 x i64> @mul_splat_sext_v2i64(ptr %x, ptr %y) { +; CHECK-LABEL: mul_splat_sext_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: .LBB2_1: // %l1 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr d2, [x1, x8] +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: cmp w8, #16 +; CHECK-NEXT: smlal v0.2d, v2.2s, v1.s[1] +; CHECK-NEXT: b.eq .LBB2_1 +; CHECK-NEXT: // %bb.2: // %l2 +; CHECK-NEXT: ret +entry: + %x.val = load <2 x i32>, ptr %x + %x.ext = sext <2 x i32> %x.val to <2 x i64> + %a = shufflevector <2 x i64> %x.ext, <2 x i64> undef, <2 x i32> <i32 1, i32 1> + br label %l1 + +l1: + %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] + %q = phi <2 x i64> [ zeroinitializer, %entry ], [ %c, %l1 ] + %y.idx = mul nuw nsw i32 %p, 4 + %y.ptr = getelementptr i32, ptr %y, i32 %y.idx + %y.val = load <2 x i32>, ptr %y.ptr + %y.ext = sext <2 x i32> %y.val to <2 x i64> + %b = mul <2 x i64> %y.ext, %a + %c = add <2 x i64> %q, %b + %pa = add i32 %p, 1 + %c1 = icmp eq i32 %p, 0 + br i1 %c1, label %l1, label %l2 + +l2: + ret <2 x i64> %c +} + +define <8 x i16> @mul_sext_splat_v8i16(ptr %x, ptr %y) { +; CHECK-LABEL: mul_sext_splat_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: dup v1.8b, v1.b[3] +; CHECK-NEXT: .LBB3_1: // %l1 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr d2, [x1, x8] +; CHECK-NEXT: add x8, x8, #4 +; CHECK-NEXT: cmp w8, #4 +; CHECK-NEXT: smlal v0.8h, v2.8b, v1.8b +; CHECK-NEXT: b.eq .LBB3_1 +; CHECK-NEXT: // %bb.2: // %l2 +; CHECK-NEXT: ret +entry: + %x.val = load <8 x i8>, ptr %x + %x.spt = shufflevector <8 x i8> %x.val, <8 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %a = sext <8 x i8> %x.spt to <8 x i16> + br label %l1 + +l1: + %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] + %q = phi <8 x i16> [ 
zeroinitializer, %entry ], [ %c, %l1 ] + %y.idx = mul nuw nsw i32 %p, 4 + %y.ptr = getelementptr i8, ptr %y, i32 %y.idx + %y.val = load <8 x i8>, ptr %y.ptr + %y.ext = sext <8 x i8> %y.val to <8 x i16> + %b = mul <8 x i16> %y.ext, %a + %c = add <8 x i16> %q, %b + %pa = add i32 %p, 1 + %c1 = icmp eq i32 %p, 0 + br i1 %c1, label %l1, label %l2 + +l2: + ret <8 x i16> %c +} + +define <4 x i32> @mul_sext_splat_v4i32(ptr %x, ptr %y) { +; CHECK-LABEL: mul_sext_splat_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: .LBB4_1: // %l1 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr d2, [x1, x8] +; CHECK-NEXT: add x8, x8, #8 +; CHECK-NEXT: cmp w8, #8 +; CHECK-NEXT: smlal v0.4s, v2.4h, v1.h[3] +; CHECK-NEXT: b.eq .LBB4_1 +; CHECK-NEXT: // %bb.2: // %l2 +; CHECK-NEXT: ret +entry: + %x.val = load <4 x i16>, ptr %x + %x.spt = shufflevector <4 x i16> %x.val, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %a = sext <4 x i16> %x.spt to <4 x i32> + br label %l1 + +l1: + %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] + %q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ] + %y.idx = mul nuw nsw i32 %p, 4 + %y.ptr = getelementptr i16, ptr %y, i32 %y.idx + %y.val = load <4 x i16>, ptr %y.ptr + %y.ext = sext <4 x i16> %y.val to <4 x i32> + %b = mul <4 x i32> %y.ext, %a + %c = add <4 x i32> %q, %b + %pa = add i32 %p, 1 + %c1 = icmp eq i32 %p, 0 + br i1 %c1, label %l1, label %l2 + +l2: + ret <4 x i32> %c +} + +define <2 x i64> @mul_sext_splat_v2i64(ptr %x, ptr %y) { +; CHECK-LABEL: mul_sext_splat_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: .LBB5_1: // %l1 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr d2, [x1, x8] +; CHECK-NEXT: add x8, x8, #16 +; CHECK-NEXT: cmp w8, #16 +; CHECK-NEXT: smlal v0.2d, v2.2s, v1.s[1] +; CHECK-NEXT: b.eq .LBB5_1 +; CHECK-NEXT: 
// %bb.2: // %l2 +; CHECK-NEXT: ret +entry: + %x.val = load <2 x i32>, ptr %x + %x.spt = shufflevector <2 x i32> %x.val, <2 x i32> undef, <2 x i32> <i32 1, i32 1> + %a = sext <2 x i32> %x.spt to <2 x i64> + br label %l1 + +l1: + %p = phi i32 [ 0, %entry ], [ %pa, %l1 ] + %q = phi <2 x i64> [ zeroinitializer, %entry ], [ %c, %l1 ] + %y.idx = mul nuw nsw i32 %p, 4 + %y.ptr = getelementptr i32, ptr %y, i32 %y.idx + %y.val = load <2 x i32>, ptr %y.ptr + %y.ext = sext <2 x i32> %y.val to <2 x i64> + %b = mul <2 x i64> %y.ext, %a + %c = add <2 x i64> %q, %b + %pa = add i32 %p, 1 + %c1 = icmp eq i32 %p, 0 + br i1 %c1, label %l1, label %l2 + +l2: + ret <2 x i64> %c +} diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll index d156ec079ae94..4cee60f2d8265 100644 --- a/llvm/test/CodeGen/AArch64/sinksplat.ll +++ b/llvm/test/CodeGen/AArch64/sinksplat.ll @@ -204,10 +204,9 @@ define <4 x i32> @mlal(<4 x i32> %x, ptr %y) { ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: mov w8, #1 // =0x1 -; CHECK-NEXT: dup v1.4s, v1.s[3] ; CHECK-NEXT: .LBB6_1: // %l1 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mla v0.4s, v2.4s, v1.4s +; CHECK-NEXT: mla v0.4s, v2.4s, v1.s[3] ; CHECK-NEXT: subs w8, w8, #1 ; CHECK-NEXT: b.eq .LBB6_1 ; CHECK-NEXT: // %bb.2: // %l2