
Commit 1c32b6f

[AArch64][ARM] Move ARM-specific InstCombine transforms into Transforms/Utils (#169589)
Back when `TargetTransformInfo::instCombineIntrinsic` was added in https://reviews.llvm.org/D81728, several transforms common to both ARM and AArch64 were kept in the non-target-specific `InstCombineCalls.cpp` so they could be shared between the two targets. I want to extend the transform of the `tbl` intrinsics into static `shufflevector`s in a similar manner to #169110 (right now it only works with a 64-bit `tbl1`, but `shufflevector` should allow it to work with up to 2 operands, and it can definitely work with 128-bit vectors). I think separating out the transform into a TTI hook is a prerequisite. ~~I'm not happy about creating an entirely new module for this and having to wire it up through CMake and everything, but I'm not sure about the alternatives. If any maintainers can think of a cleaner way of doing this, I'm very open to it.~~ I've moved the transforms into `Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp`, which is a lot simpler.
1 parent 2f86bc2 commit 1c32b6f
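
For context, the `tbl1` transform being moved here rewrites a table lookup with a constant mask into a static `shufflevector`. A minimal before/after sketch in LLVM IR (a hypothetical example, not taken from this commit; the byte-reverse mask is the one named in the transform's own doc comment):

  ; before: 64-bit vtbl1 with the constant byte-reverse mask {7,6,5,4,3,2,1,0}
  %r = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %tbl, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ; after: a static shufflevector, which the backend can lower to rev64
  %r = shufflevector <8 x i8> %tbl, <8 x i8> zeroinitializer, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>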

File tree

10 files changed: +223 additions, -107 deletions

llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+//===- ARMCommonInstCombineIntrinsic.h - Shared ARM/AArch64 opts *- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains optimizations for ARM and AArch64 intrinsics that
+/// are shared between both architectures. These functions can be called from:
+/// - ARM TTI's instCombineIntrinsic (for arm_neon_* intrinsics)
+/// - AArch64 TTI's instCombineIntrinsic (for aarch64_neon_* and aarch64_sve_*
+///   intrinsics)
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
+#define LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
+
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+namespace llvm {
+
+namespace ARMCommon {
+
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
+Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC);
+
+/// Simplify NEON multiply-long intrinsics (smull, umull).
+/// These intrinsics perform widening multiplies: they multiply two vectors of
+/// narrow integers and produce a vector of wider integers. This function
+/// performs algebraic simplifications:
+/// 1. Multiply by zero => zero vector
+/// 2. Multiply by one => zero/sign-extend the non-one operand
+/// 3. Both operands constant => regular multiply that can be constant-folded
+///    later
+Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
+                                  bool IsSigned);
+
+/// Simplify AES encryption/decryption intrinsics (AESE, AESD).
+///
+/// ARM's AES instructions (AESE/AESD) XOR the data and the key, provided as
+/// separate arguments, before performing the encryption/decryption operation.
+/// We can fold that "internal" XOR with a previous one.
+Instruction *simplifyAES(IntrinsicInst &II, InstCombiner &IC);
+
+} // namespace ARMCommon
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
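
As an illustration of the `simplifyNeonMultiply` cases documented above, a hypothetical IR sketch (assuming the `vmulls` overload already used by the existing ARM tests; not part of this commit):

  ; multiply by a splat of 1: the signed variant becomes a plain sign-extend
  %r = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
  ; after the transform:
  %r = sext <4 x i16> %x to <4 x i32>
  ; multiply by zero folds straight to zeroinitializer, and two constant
  ; operands become a regular mul that later constant folding removes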

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 13 additions & 0 deletions
@@ -25,6 +25,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
@@ -2873,6 +2874,18 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   case Intrinsic::aarch64_neon_fmaxnm:
   case Intrinsic::aarch64_neon_fminnm:
     return instCombineMaxMinNM(IC, II);
+  case Intrinsic::aarch64_neon_tbl1:
+    return ARMCommon::simplifyNeonTbl1(II, IC);
+  case Intrinsic::aarch64_neon_smull:
+  case Intrinsic::aarch64_neon_umull: {
+    bool IsSigned = IID == Intrinsic::aarch64_neon_smull;
+    return ARMCommon::simplifyNeonMultiply(II, IC, IsSigned);
+  }
+  case Intrinsic::aarch64_crypto_aesd:
+  case Intrinsic::aarch64_crypto_aese:
+  case Intrinsic::aarch64_sve_aesd:
+  case Intrinsic::aarch64_sve_aese:
+    return ARMCommon::simplifyAES(II, IC);
   case Intrinsic::aarch64_sve_convert_from_svbool:
     return instCombineConvertFromSVBool(IC, II);
   case Intrinsic::aarch64_sve_dup:

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 14 additions & 0 deletions
@@ -31,6 +31,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
@@ -187,6 +188,19 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     break;
   }
 
+  case Intrinsic::arm_neon_vtbl1:
+    return ARMCommon::simplifyNeonTbl1(II, IC);
+
+  case Intrinsic::arm_neon_vmulls:
+  case Intrinsic::arm_neon_vmullu: {
+    bool IsSigned = IID == Intrinsic::arm_neon_vmulls;
+    return ARMCommon::simplifyNeonMultiply(II, IC, IsSigned);
+  }
+
+  case Intrinsic::arm_neon_aesd:
+  case Intrinsic::arm_neon_aese:
+    return ARMCommon::simplifyAES(II, IC);
+
   case Intrinsic::arm_mve_pred_i2v: {
     Value *Arg = II.getArgOperand(0);
     Value *ArgArg;

llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp

Lines changed: 0 additions & 104 deletions
@@ -737,44 +737,6 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
   return nullptr;
 }
 
-/// Convert a table lookup to shufflevector if the mask is constant.
-/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
-/// which case we could lower the shufflevector with rev64 instructions
-/// as it's actually a byte reverse.
-static Value *simplifyNeonTbl1(const IntrinsicInst &II,
-                               InstCombiner::BuilderTy &Builder) {
-  // Bail out if the mask is not a constant.
-  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
-  if (!C)
-    return nullptr;
-
-  auto *VecTy = cast<FixedVectorType>(II.getType());
-  unsigned NumElts = VecTy->getNumElements();
-
-  // Only perform this transformation for <8 x i8> vector types.
-  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
-    return nullptr;
-
-  int Indexes[8];
-
-  for (unsigned I = 0; I < NumElts; ++I) {
-    Constant *COp = C->getAggregateElement(I);
-
-    if (!COp || !isa<ConstantInt>(COp))
-      return nullptr;
-
-    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
-
-    // Make sure the mask indices are in range.
-    if ((unsigned)Indexes[I] >= NumElts)
-      return nullptr;
-  }
-
-  auto *V1 = II.getArgOperand(0);
-  auto *V2 = Constant::getNullValue(V1->getType());
-  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
-}
-
 // Returns true iff the 2 intrinsics have the same operands, limiting the
 // comparison to the first NumOperands.
 static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
@@ -3166,72 +3128,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
         Intrinsic::getOrInsertDeclaration(II->getModule(), NewIntrin);
     return CallInst::Create(NewFn, CallArgs);
   }
-  case Intrinsic::arm_neon_vtbl1:
-  case Intrinsic::aarch64_neon_tbl1:
-    if (Value *V = simplifyNeonTbl1(*II, Builder))
-      return replaceInstUsesWith(*II, V);
-    break;
-
-  case Intrinsic::arm_neon_vmulls:
-  case Intrinsic::arm_neon_vmullu:
-  case Intrinsic::aarch64_neon_smull:
-  case Intrinsic::aarch64_neon_umull: {
-    Value *Arg0 = II->getArgOperand(0);
-    Value *Arg1 = II->getArgOperand(1);
-
-    // Handle mul by zero first:
-    if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
-      return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
-    }
-
-    // Check for constant LHS & RHS - in this case we just simplify.
-    bool Zext = (IID == Intrinsic::arm_neon_vmullu ||
-                 IID == Intrinsic::aarch64_neon_umull);
-    VectorType *NewVT = cast<VectorType>(II->getType());
-    if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
-      if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
-        Value *V0 = Builder.CreateIntCast(CV0, NewVT, /*isSigned=*/!Zext);
-        Value *V1 = Builder.CreateIntCast(CV1, NewVT, /*isSigned=*/!Zext);
-        return replaceInstUsesWith(CI, Builder.CreateMul(V0, V1));
-      }
-
-      // Couldn't simplify - canonicalize constant to the RHS.
-      std::swap(Arg0, Arg1);
-    }
-
-    // Handle mul by one:
-    if (Constant *CV1 = dyn_cast<Constant>(Arg1))
-      if (ConstantInt *Splat =
-              dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
-        if (Splat->isOne())
-          return CastInst::CreateIntegerCast(Arg0, II->getType(),
-                                             /*isSigned=*/!Zext);
-
-    break;
-  }
-  case Intrinsic::arm_neon_aesd:
-  case Intrinsic::arm_neon_aese:
-  case Intrinsic::aarch64_crypto_aesd:
-  case Intrinsic::aarch64_crypto_aese:
-  case Intrinsic::aarch64_sve_aesd:
-  case Intrinsic::aarch64_sve_aese: {
-    Value *DataArg = II->getArgOperand(0);
-    Value *KeyArg = II->getArgOperand(1);
-
-    // Accept zero on either operand.
-    if (!match(KeyArg, m_ZeroInt()))
-      std::swap(KeyArg, DataArg);
-
-    // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
-    Value *Data, *Key;
-    if (match(KeyArg, m_ZeroInt()) &&
-        match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
-      replaceOperand(*II, 0, Data);
-      replaceOperand(*II, 1, Key);
-      return II;
-    }
-    break;
-  }
   case Intrinsic::hexagon_V6_vandvrt:
   case Intrinsic::hexagon_V6_vandvrt_128B: {
     // Simplify Q -> V -> Q conversion.
llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
+//===- ARMCommonInstCombineIntrinsic.cpp - Shared ARM/AArch64 opts -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains optimizations for ARM and AArch64 intrinsics that
+/// are shared between both architectures. These functions can be called from:
+/// - ARM TTI's instCombineIntrinsic (for arm_neon_* intrinsics)
+/// - AArch64 TTI's instCombineIntrinsic (for aarch64_neon_* and aarch64_sve_*
+///   intrinsics)
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+namespace llvm {
+namespace ARMCommon {
+
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
+Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC) {
+  // Bail out if the mask is not a constant.
+  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
+  if (!C)
+    return nullptr;
+
+  auto *VecTy = cast<FixedVectorType>(II.getType());
+  unsigned NumElts = VecTy->getNumElements();
+
+  // Only perform this transformation for <8 x i8> vector types.
+  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
+    return nullptr;
+
+  int Indexes[8];
+
+  for (unsigned I = 0; I < NumElts; ++I) {
+    Constant *COp = C->getAggregateElement(I);
+
+    if (!COp || !isa<ConstantInt>(COp))
+      return nullptr;
+
+    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
+
+    // Make sure the mask indices are in range.
+    if ((unsigned)Indexes[I] >= NumElts)
+      return nullptr;
+  }
+
+  auto *V1 = II.getArgOperand(0);
+  auto *V2 = Constant::getNullValue(V1->getType());
+  Value *Shuf = IC.Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
+  return IC.replaceInstUsesWith(II, Shuf);
+}
+
+/// Simplify NEON multiply-long intrinsics (smull, umull).
+/// These intrinsics perform widening multiplies: they multiply two vectors of
+/// narrow integers and produce a vector of wider integers. This function
+/// performs algebraic simplifications:
+/// 1. Multiply by zero => zero vector
+/// 2. Multiply by one => zero/sign-extend the non-one operand
+/// 3. Both operands constant => regular multiply that can be constant-folded
+///    later
+Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
+                                  bool IsSigned) {
+  Value *Arg0 = II.getArgOperand(0);
+  Value *Arg1 = II.getArgOperand(1);
+
+  // Handle mul by zero first:
+  if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
+    return IC.replaceInstUsesWith(II, ConstantAggregateZero::get(II.getType()));
+  }
+
+  // Check for constant LHS & RHS - in this case we just simplify.
+  VectorType *NewVT = cast<VectorType>(II.getType());
+  if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
+    if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
+      Value *V0 = IC.Builder.CreateIntCast(CV0, NewVT, IsSigned);
+      Value *V1 = IC.Builder.CreateIntCast(CV1, NewVT, IsSigned);
+      return IC.replaceInstUsesWith(II, IC.Builder.CreateMul(V0, V1));
+    }
+
+    // Couldn't simplify - canonicalize constant to the RHS.
+    std::swap(Arg0, Arg1);
+  }
+
+  // Handle mul by one:
+  if (Constant *CV1 = dyn_cast<Constant>(Arg1))
+    if (ConstantInt *Splat =
+            dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
+      if (Splat->isOne())
+        return CastInst::CreateIntegerCast(Arg0, II.getType(), IsSigned);
+
+  return nullptr;
+}
+
+/// Simplify AES encryption/decryption intrinsics (AESE, AESD).
+///
+/// ARM's AES instructions (AESE/AESD) XOR the data and the key, provided as
+/// separate arguments, before performing the encryption/decryption operation.
+/// We can fold that "internal" XOR with a previous one.
+Instruction *simplifyAES(IntrinsicInst &II, InstCombiner &IC) {
+  Value *DataArg = II.getArgOperand(0);
+  Value *KeyArg = II.getArgOperand(1);
+
+  // Accept zero on either operand.
+  if (!match(KeyArg, m_ZeroInt()))
+    std::swap(KeyArg, DataArg);
+
+  // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
+  Value *Data, *Key;
+  if (match(KeyArg, m_ZeroInt()) &&
+      match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
+    IC.replaceOperand(II, 0, Data);
+    IC.replaceOperand(II, 1, Key);
+    return &II;
+  }
+
+  return nullptr;
+}
+
+} // namespace ARMCommon
+} // namespace llvm
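
In IR terms, the `simplifyAES` fold above does the following (a hypothetical sketch mirroring the `combineXorAeseZero*` tests updated below; not itself part of this commit):

  ; before: the explicit xor duplicates aese's internal data ^ key
  %xored = xor <16 x i8> %data, %key
  %r = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %xored, <16 x i8> zeroinitializer)
  ; after: the prior xor is absorbed into the intrinsic's two operands
  %r = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key)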

llvm/lib/Transforms/Utils/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 add_llvm_component_library(LLVMTransformUtils
   AddDiscriminators.cpp
   AMDGPUEmitPrintf.cpp
+  ARMCommonInstCombineIntrinsic.cpp
   ASanStackFrameLayout.cpp
   AssumeBundleBuilder.cpp
   BasicBlockUtils.cpp

llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+; RUN: opt --mtriple=aarch64 -S -passes=instcombine < %s | FileCheck %s
 ; ARM64 AES intrinsic variants
 
 define <16 x i8> @combineXorAeseZeroARM64(<16 x i8> %data, <16 x i8> %key) {

llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+; RUN: opt -mtriple=arm -S -passes=instcombine < %s | FileCheck %s
 
 define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp {
 ; CHECK-LABEL: define <4 x i32> @mulByZero(

llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+; RUN: opt -mtriple=arm -S -passes=instcombine < %s | FileCheck %s
 ; ARM AES intrinsic variants
 
 define <16 x i8> @combineXorAeseZeroARM(<16 x i8> %data, <16 x i8> %key) {
