diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index fede586cf35bc..280cc11dbdafe 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5895,6 +5895,15 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, SrcTy = DstTy; } + // Check for identity masks, which we can treat as free. Do this before the + // segmented shuffle matching so fixed-length identity masks never reach it. + if (!Mask.empty() && LT.second.isFixedLengthVector() && + (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && + all_of(enumerate(Mask), [](const auto &M) { + return M.value() < 0 || M.value() == (int)M.index(); + })) + return 0; + // Segmented shuffle matching. if (Kind == TTI::SK_PermuteSingleSrc && isa(SrcTy) && !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() && @@ -5942,14 +5951,6 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, all_of(Mask, [](int E) { return E < 8; })) return getPerfectShuffleCost(Mask); - // Check for identity masks, which we can treat as free. - if (!Mask.empty() && LT.second.isFixedLengthVector() && - (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && - all_of(enumerate(Mask), [](const auto &M) { - return M.value() < 0 || M.value() == (int)M.index(); - })) - return 0; - // Check for other shuffles that are not SK_ kinds but we have native // instructions for, for example ZIP and UZP. 
unsigned Unused; diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll index 4579acb9b3555..76be4dc4b19fb 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll @@ -490,3 +490,15 @@ define void @vst4(ptr %p) { ret void } + +define <16 x i8> @identity_shuffle_costs() #0 { +bb: + ; CHECK-LABEL: 'identity_shuffle_costs' + ; CHECK: Cost Model: Found costs of 0 for: %shufflevector142 = shufflevector <16 x i8> %trunc125, <16 x i8> %trunc133, <16 x i32> + %trunc125 = trunc <16 x i32> zeroinitializer to <16 x i8> + %trunc133 = trunc <16 x i32> zeroinitializer to <16 x i8> + %shufflevector142 = shufflevector <16 x i8> %trunc125, <16 x i8> %trunc133, <16 x i32> + ret <16 x i8> %shufflevector142 +} + +attributes #0 = { "target-features"="+sve,+neon" } diff --git a/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll new file mode 100644 index 0000000000000..0a668f856fd3e --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=vector-combine -mtriple=aarch64-unknown-linux-gnu -S %s | FileCheck %s +target triple = "aarch64-unknown-linux-gnu" + +define i32 @ham(ptr %call12) local_unnamed_addr #0 { +; CHECK-LABEL: define i32 @ham( +; CHECK-SAME: ptr [[CALL12:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK: [[TMP2:%.*]] = mul nuw nsw <32 x i32> [[TMP1:%.*]], + +; CHECK: [[TMP5:%.*]] = mul nuw <32 x i32> [[TMP4:%.*]], + +; CHECK: [[TMP8:%.*]] = mul nuw <32 x i32> [[TMP7:%.*]], + +bb: + br label %bb48 + +bb48: ; preds = %bb48, %bb + %phi49 = phi i64 [ 0, %bb ], [ %add86, %bb48 ] + %mul50 = mul i64 %phi49, 3 + %getelementptr53 = getelementptr i8, ptr %call12, i64 %mul50 + %load54 = load <48 x 
i8>, ptr %getelementptr53, align 1, !tbaa !10, !alias.scope !18 + %shufflevector = shufflevector <48 x i8> %load54, <48 x i8> poison, <16 x i32> + %zext57 = zext <16 x i8> %shufflevector to <16 x i32> + %mul58 = mul nuw nsw <16 x i32> %zext57, splat (i32 19595) + %zext59 = zext <16 x i8> %shufflevector to <16 x i32> + %mul60 = mul nuw nsw <16 x i32> %zext59, splat (i32 38470) + %zext61 = zext <16 x i8> %shufflevector to <16 x i32> + %mul62 = mul nuw nsw <16 x i32> %zext61, splat (i32 7471) + %add63 = add nuw nsw <16 x i32> %mul58, splat (i32 32768) + %add64 = add nuw nsw <16 x i32> %add63, %mul60 + %add65 = add nuw nsw <16 x i32> %add64, %mul62 + %lshr = lshr <16 x i32> %add65, splat (i32 16) + %trunc66 = trunc nuw <16 x i32> %lshr to <16 x i8> + %mul67 = mul nuw nsw <16 x i32> %zext57, splat (i32 32767) + %mul68 = mul nuw <16 x i32> %zext59, splat (i32 16762097) + %mul69 = mul nuw <16 x i32> %zext61, splat (i32 16759568) + %add70 = add nuw nsw <16 x i32> %mul67, splat (i32 32768) + %add71 = add nuw <16 x i32> %add70, %mul68 + %add72 = add <16 x i32> %add71, %mul69 + %lshr73 = lshr <16 x i32> %add72, splat (i32 16) + %trunc74 = trunc <16 x i32> %lshr73 to <16 x i8> + %mul75 = mul nuw nsw <16 x i32> %zext57, splat (i32 13282) + %mul76 = mul nuw <16 x i32> %zext59, splat (i32 16744449) + %mul77 = mul nuw nsw <16 x i32> %zext61, splat (i32 19485) + %add78 = add nuw nsw <16 x i32> %mul75, splat (i32 32768) + %add79 = add nuw <16 x i32> %add78, %mul76 + %add80 = add nuw <16 x i32> %add79, %mul77 + %lshr81 = lshr <16 x i32> %add80, splat (i32 16) + %trunc82 = trunc <16 x i32> %lshr81 to <16 x i8> + %shufflevector83 = shufflevector <16 x i8> %trunc66, <16 x i8> %trunc74, <32 x i32> + %shufflevector84 = shufflevector <16 x i8> %trunc82, <16 x i8> poison, <32 x i32> + store <32 x i8> %shufflevector83, ptr %getelementptr53, align 1, !tbaa !10, !noalias !18 + %add86 = add nuw i64 %phi49, 16 + %icmp87 = icmp eq i64 %add86, %mul50 + br i1 %icmp87, label %bb205, label %bb48 + 
+bb205: ; preds = %bb48, %bb + ret i32 0 +} + +attributes #0 = { nounwind uwtable vscale_range(1,16) "target-cpu"="cortex-a57" "target-features"="+sve,+neon"} + +!8 = !{!"omnipotent char", !9, i64 0} +!9 = !{!"Simple C/C++ TBAA"} +!10 = !{!8, !8, i64 0} +!12 = !{!"int", !8, i64 0} +!18 = !{!19} +!19 = distinct !{!19, !20} +!20 = distinct !{!20, !"LVerDomain"}