Skip to content

Commit 124fa5c

Browse files
[AArch64] - Improve costing for Identity shuffles for SVE targets. (#165375)
Identity masks can be treated as free when scalable vectorization is possible, making the check agnostic of the vectorization policy (fixed or scalable). This allows for aggressive vector combines for identity shuffle masks.
1 parent 576e1af commit 124fa5c

File tree

3 files changed

+82
-8
lines changed

3 files changed

+82
-8
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6000,6 +6000,15 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
60006000
SrcTy = DstTy;
60016001
}
60026002

6003+
// Check for identity masks, which we can treat as free for both fixed and
6004+
// scalable vector paths.
6005+
if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6006+
(Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6007+
all_of(enumerate(Mask), [](const auto &M) {
6008+
return M.value() < 0 || M.value() == (int)M.index();
6009+
}))
6010+
return 0;
6011+
60036012
// Segmented shuffle matching.
60046013
if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
60056014
!Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
@@ -6047,14 +6056,6 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
60476056
all_of(Mask, [](int E) { return E < 8; }))
60486057
return getPerfectShuffleCost(Mask);
60496058

6050-
// Check for identity masks, which we can treat as free.
6051-
if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6052-
(Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6053-
all_of(enumerate(Mask), [](const auto &M) {
6054-
return M.value() < 0 || M.value() == (int)M.index();
6055-
}))
6056-
return 0;
6057-
60586059
// Check for other shuffles that are not SK_ kinds but we have native
60596060
// instructions for, for example ZIP and UZP.
60606061
unsigned Unused;

llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,3 +490,15 @@ define void @vst4(ptr %p) {
490490

491491
ret void
492492
}
493+
494+
define void @identity_shuffle_costs() #0 {
495+
bb:
496+
; CHECK-LABEL: 'identity_shuffle_costs'
497+
; CHECK: Cost Model: Found costs of 0 for: %shufflevector142 = shufflevector <16 x i8> zeroinitializer, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
498+
; CHECK: Cost Model: Found costs of 0 for: %shufflevector84 = shufflevector <16 x i8> zeroinitializer, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
499+
%shufflevector142 = shufflevector <16 x i8> zeroinitializer, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
500+
%shufflevector84 = shufflevector <16 x i8> zeroinitializer, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
501+
ret void
502+
}
503+
504+
attributes #0 = { "target-features"="+sve,+neon" }
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
; NOTE: This test checks that identity shuffles are costed as zero regardless of whether the shuffle vectors are scalable or fixed-width. As a result, aggressive vector-combine transforms are enabled.
2+
; RUN: opt -passes=vector-combine -S %s | FileCheck %s
3+
target triple = "aarch64-unknown-linux-gnu"
4+
5+
define i32 @ham(ptr %call12) #0 {
6+
; CHECK-LABEL: define i32 @ham(
7+
; CHECK-SAME: ptr [[CALL12:%.*]]) #[[ATTR0:[0-9]+]] {
8+
; CHECK: [[TMP2:%.*]] = mul nuw nsw <32 x i32> [[TMP1:%.*]], <i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
9+
10+
; CHECK: [[TMP5:%.*]] = mul nuw <32 x i32> [[TMP4:%.*]], <i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097>
11+
12+
; CHECK: [[TMP8:%.*]] = mul nuw <32 x i32> [[TMP7:%.*]], <i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568>
13+
14+
bb:
15+
br label %bb48
16+
17+
bb48: ; preds = %bb48, %bb
18+
%phi49 = phi i64 [ 0, %bb ], [ %add86, %bb48 ]
19+
%mul50 = mul i64 %phi49, 3
20+
%getelementptr53 = getelementptr i8, ptr %call12, i64 %mul50
21+
%load54 = load <48 x i8>, ptr %getelementptr53, align 1
22+
%shufflevector = shufflevector <48 x i8> %load54, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
23+
%zext57 = zext <16 x i8> %shufflevector to <16 x i32>
24+
%mul58 = mul nuw nsw <16 x i32> %zext57, splat (i32 19595)
25+
%zext59 = zext <16 x i8> %shufflevector to <16 x i32>
26+
%mul60 = mul nuw nsw <16 x i32> %zext59, splat (i32 38470)
27+
%zext61 = zext <16 x i8> %shufflevector to <16 x i32>
28+
%mul62 = mul nuw nsw <16 x i32> %zext61, splat (i32 7471)
29+
%add63 = add nuw nsw <16 x i32> %mul58, splat (i32 32768)
30+
%add64 = add nuw nsw <16 x i32> %add63, %mul60
31+
%add65 = add nuw nsw <16 x i32> %add64, %mul62
32+
%lshr = lshr <16 x i32> %add65, splat (i32 16)
33+
%trunc66 = trunc nuw <16 x i32> %lshr to <16 x i8>
34+
%mul67 = mul nuw nsw <16 x i32> %zext57, splat (i32 32767)
35+
%mul68 = mul nuw <16 x i32> %zext59, splat (i32 16762097)
36+
%mul69 = mul nuw <16 x i32> %zext61, splat (i32 16759568)
37+
%add70 = add nuw nsw <16 x i32> %mul67, splat (i32 32768)
38+
%add71 = add nuw <16 x i32> %add70, %mul68
39+
%add72 = add <16 x i32> %add71, %mul69
40+
%lshr73 = lshr <16 x i32> %add72, splat (i32 16)
41+
%trunc74 = trunc <16 x i32> %lshr73 to <16 x i8>
42+
%mul75 = mul nuw nsw <16 x i32> %zext57, splat (i32 13282)
43+
%mul76 = mul nuw <16 x i32> %zext59, splat (i32 16744449)
44+
%mul77 = mul nuw nsw <16 x i32> %zext61, splat (i32 19485)
45+
%add78 = add nuw nsw <16 x i32> %mul75, splat (i32 32768)
46+
%add79 = add nuw <16 x i32> %add78, %mul76
47+
%add80 = add nuw <16 x i32> %add79, %mul77
48+
%lshr81 = lshr <16 x i32> %add80, splat (i32 16)
49+
%trunc82 = trunc <16 x i32> %lshr81 to <16 x i8>
50+
%shufflevector83 = shufflevector <16 x i8> %trunc66, <16 x i8> %trunc74, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
51+
%shufflevector84 = shufflevector <16 x i8> %trunc82, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
52+
store <32 x i8> %shufflevector83, ptr %getelementptr53, align 1
53+
%add86 = add nuw i64 %phi49, 16
54+
%icmp87 = icmp eq i64 %add86, %mul50
55+
br i1 %icmp87, label %bb205, label %bb48
56+
57+
bb205: ; preds = %bb48, %bb
58+
ret i32 0
59+
}
60+
61+
attributes #0 = { vscale_range(1,16) "target-features"="+sve,+neon"}

0 commit comments

Comments
 (0)