Skip to content

Commit 02db2de

Browse files
authored
[AArch64][SVE] Implement demanded bits for @llvm.aarch64.sve.cntp (#168714)
This allows DemandedBits to see that the SVE CNTP intrinsic will only ever produce small non-negative integers. The maximum value you could get here is 256, which is CNTP on an nxv16i1 on a machine with a 2048-bit vector size (the maximum for SVE). Using this, various redundant operations (zexts, sexts, ands, ors, etc.) can be eliminated.
1 parent 0a88e96 commit 02db2de

File tree

3 files changed

+101
-27
lines changed

3 files changed

+101
-27
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 40 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19443,20 +19443,37 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
1944319443
return CSNeg;
1944419444
}
1944519445

19446-
static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
19446+
static bool IsSVECntIntrinsic(SDValue S) {
1944719447
switch(getIntrinsicID(S.getNode())) {
1944819448
default:
1944919449
break;
1945019450
case Intrinsic::aarch64_sve_cntb:
19451-
return 8;
1945219451
case Intrinsic::aarch64_sve_cnth:
19453-
return 16;
1945419452
case Intrinsic::aarch64_sve_cntw:
19455-
return 32;
1945619453
case Intrinsic::aarch64_sve_cntd:
19457-
return 64;
19454+
return true;
19455+
}
19456+
return false;
19457+
}
19458+
19459+
// Returns the maximum (scalable) value that can be returned by an SVE count
19460+
// intrinsic. Returns std::nullopt if \p Op is not aarch64_sve_cnt*.
19461+
static std::optional<ElementCount> getMaxValueForSVECntIntrinsic(SDValue Op) {
19462+
Intrinsic::ID IID = getIntrinsicID(Op.getNode());
19463+
if (IID == Intrinsic::aarch64_sve_cntp)
19464+
return Op.getOperand(1).getValueType().getVectorElementCount();
19465+
switch (IID) {
19466+
case Intrinsic::aarch64_sve_cntd:
19467+
return ElementCount::getScalable(2);
19468+
case Intrinsic::aarch64_sve_cntw:
19469+
return ElementCount::getScalable(4);
19470+
case Intrinsic::aarch64_sve_cnth:
19471+
return ElementCount::getScalable(8);
19472+
case Intrinsic::aarch64_sve_cntb:
19473+
return ElementCount::getScalable(16);
19474+
default:
19475+
return std::nullopt;
1945819476
}
19459-
return {};
1946019477
}
1946119478

1946219479
/// Calculates what the pre-extend type is, based on the extension
@@ -31666,22 +31683,24 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
3166631683
return false;
3166731684
}
3166831685
case ISD::INTRINSIC_WO_CHAIN: {
31669-
if (auto ElementSize = IsSVECntIntrinsic(Op)) {
31670-
unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
31671-
if (!MaxSVEVectorSizeInBits)
31672-
MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
31673-
unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
31674-
// The SVE count intrinsics don't support the multiplier immediate so we
31675-
// don't have to account for that here. The value returned may be slightly
31676-
// over the true required bits, as this is based on the "ALL" pattern. The
31677-
// other patterns are also exposed by these intrinsics, but they all
31678-
// return a value that's strictly less than "ALL".
31679-
unsigned RequiredBits = llvm::bit_width(MaxElements);
31680-
unsigned BitWidth = Known.Zero.getBitWidth();
31681-
if (RequiredBits < BitWidth)
31682-
Known.Zero.setHighBits(BitWidth - RequiredBits);
31686+
std::optional<ElementCount> MaxCount = getMaxValueForSVECntIntrinsic(Op);
31687+
if (!MaxCount)
3168331688
return false;
31684-
}
31689+
unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
31690+
if (!MaxSVEVectorSizeInBits)
31691+
MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
31692+
unsigned VscaleMax = MaxSVEVectorSizeInBits / 128;
31693+
unsigned MaxValue = MaxCount->getKnownMinValue() * VscaleMax;
31694+
// The SVE count intrinsics don't support the multiplier immediate so we
31695+
// don't have to account for that here. The value returned may be slightly
31696+
// over the true required bits, as this is based on the "ALL" pattern. The
31697+
// other patterns are also exposed by these intrinsics, but they all
31698+
// return a value that's strictly less than "ALL".
31699+
unsigned RequiredBits = llvm::bit_width(MaxValue);
31700+
unsigned BitWidth = Known.Zero.getBitWidth();
31701+
if (RequiredBits < BitWidth)
31702+
Known.Zero.setHighBits(BitWidth - RequiredBits);
31703+
return false;
3168531704
}
3168631705
}
3168731706

llvm/test/CodeGen/AArch64/sve-vector-compress.ll

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -143,20 +143,19 @@ define <vscale x 8 x i32> @test_compress_large(<vscale x 8 x i32> %vec, <vscale
143143
; CHECK-NEXT: addvl sp, sp, #-2
144144
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
145145
; CHECK-NEXT: .cfi_offset w29, -16
146-
; CHECK-NEXT: punpklo p2.h, p0.b
146+
; CHECK-NEXT: punpklo p1.h, p0.b
147147
; CHECK-NEXT: cnth x9
148-
; CHECK-NEXT: ptrue p1.s
148+
; CHECK-NEXT: ptrue p2.s
149149
; CHECK-NEXT: sub x9, x9, #1
150150
; CHECK-NEXT: punpkhi p0.h, p0.b
151-
; CHECK-NEXT: compact z0.s, p2, z0.s
152-
; CHECK-NEXT: cntp x8, p1, p2.s
151+
; CHECK-NEXT: compact z0.s, p1, z0.s
152+
; CHECK-NEXT: cntp x8, p2, p1.s
153153
; CHECK-NEXT: compact z1.s, p0, z1.s
154154
; CHECK-NEXT: str z0, [sp]
155-
; CHECK-NEXT: mov w8, w8
156155
; CHECK-NEXT: cmp x8, x9
157156
; CHECK-NEXT: csel x8, x8, x9, lo
158157
; CHECK-NEXT: mov x9, sp
159-
; CHECK-NEXT: st1w { z1.s }, p1, [x9, x8, lsl #2]
158+
; CHECK-NEXT: st1w { z1.s }, p2, [x9, x8, lsl #2]
160159
; CHECK-NEXT: ldr z0, [sp]
161160
; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
162161
; CHECK-NEXT: addvl sp, sp, #2

llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,62 @@ define i64 @cntd_and_elimination() {
8080
ret i64 %result
8181
}
8282

83+
define i64 @cntp_nxv16i1_and_elimination(<vscale x 16 x i1> %p) {
84+
; CHECK-LABEL: cntp_nxv16i1_and_elimination:
85+
; CHECK: // %bb.0:
86+
; CHECK-NEXT: cntp x8, p0, p0.b
87+
; CHECK-NEXT: and x9, x8, #0x1fc
88+
; CHECK-NEXT: add x0, x8, x9
89+
; CHECK-NEXT: ret
90+
%cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv16i1(<vscale x 16 x i1> %p, <vscale x 16 x i1> %p)
91+
%and_redundant = and i64 %cntp, 511
92+
%and_required = and i64 %cntp, 17179869180
93+
%result = add i64 %and_redundant, %and_required
94+
ret i64 %result
95+
}
96+
97+
define i64 @cntp_nxv8i1_and_elimination(<vscale x 8 x i1> %p) {
98+
; CHECK-LABEL: cntp_nxv8i1_and_elimination:
99+
; CHECK: // %bb.0:
100+
; CHECK-NEXT: cntp x8, p0, p0.h
101+
; CHECK-NEXT: and x9, x8, #0xfc
102+
; CHECK-NEXT: add x0, x8, x9
103+
; CHECK-NEXT: ret
104+
%cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv8i1(<vscale x 8 x i1> %p, <vscale x 8 x i1> %p)
105+
%and_redundant = and i64 %cntp, 1023
106+
%and_required = and i64 %cntp, 17179869180
107+
%result = add i64 %and_redundant, %and_required
108+
ret i64 %result
109+
}
110+
111+
define i64 @cntp_nxv4i1_and_elimination(<vscale x 4 x i1> %p) {
112+
; CHECK-LABEL: cntp_nxv4i1_and_elimination:
113+
; CHECK: // %bb.0:
114+
; CHECK-NEXT: cntp x8, p0, p0.s
115+
; CHECK-NEXT: and x9, x8, #0x7c
116+
; CHECK-NEXT: add x0, x8, x9
117+
; CHECK-NEXT: ret
118+
%cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> %p, <vscale x 4 x i1> %p)
119+
%and_redundant = and i64 %cntp, 127
120+
%and_required = and i64 %cntp, 17179869180
121+
%result = add i64 %and_redundant, %and_required
122+
ret i64 %result
123+
}
124+
125+
define i64 @cntp_nxv2i1_and_elimination(<vscale x 2 x i1> %p) {
126+
; CHECK-LABEL: cntp_nxv2i1_and_elimination:
127+
; CHECK: // %bb.0:
128+
; CHECK-NEXT: cntp x8, p0, p0.d
129+
; CHECK-NEXT: and x9, x8, #0x3c
130+
; CHECK-NEXT: add x0, x8, x9
131+
; CHECK-NEXT: ret
132+
%cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1> %p, <vscale x 2 x i1> %p)
133+
%and_redundant = and i64 %cntp, 63
134+
%and_required = and i64 %cntp, 17179869180
135+
%result = add i64 %and_redundant, %and_required
136+
ret i64 %result
137+
}
138+
83139
define i64 @vscale_trunc_zext() vscale_range(1,16) {
84140
; CHECK-LABEL: vscale_trunc_zext:
85141
; CHECK: // %bb.0:

0 commit comments

Comments
 (0)