-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[AArch64] Allow splitting bitmasks for EOR/ORR. #150394
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Allow splitting bitmasks for EOR/ORR. #150394
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: Ricardo Jesus (rj-jesus) Changes: This patch extends #149095 for EOR and ORR. It uses a simple partition scheme to try to find two suitable disjoint bitmasks that can be used with EOR/ORR to reconstruct the original mask. It also restructures the original code to allow reusing its existing checks and splitting logic for the new opcodes. Fixes: #148987. Full diff: https://github.com/llvm/llvm-project/pull/150394.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index abcd5505f735b..68a6f931c42d8 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -8,11 +8,11 @@
//
// This pass performs below peephole optimizations on MIR level.
//
-// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
-// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
+// 1. MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
+// MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
//
// 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
-// MOVi64imm + ADDXrr ==> ANDXri + ANDXri
+// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
//
// 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi
// MOVi64imm + SUBXrr ==> SUBXri + SUBXri
@@ -125,8 +125,14 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
template <typename T>
bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
+ // Strategy used to split logical immediate bitmasks.
+ enum class SplitStrategy {
+ Intersect,
+ Disjoint,
+ };
template <typename T>
- bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0);
+ bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
+ SplitStrategy Strategy, unsigned OtherOpc = 0);
bool visitORR(MachineInstr &MI);
bool visitCSEL(MachineInstr &MI);
bool visitINSERT(MachineInstr &MI);
@@ -158,14 +164,6 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
template <typename T>
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
T UImm = static_cast<T>(Imm);
- if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
- return false;
-
- // If this immediate can be handled by one instruction, do not split it.
- SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
- AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
- if (Insn.size() == 1)
- return false;
// The bitmask immediate consists of consecutive ones. Let's say there is
// constant 0b00000000001000000000010000000000 which does not consist of
@@ -194,23 +192,72 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
}
template <typename T>
-bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI,
- unsigned OtherOpc) {
- // Try below transformation.
+static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc,
+ T &Imm2Enc) {
+ // Try to split a bitmask of the form 0b00000000011000000000011110000000 into
+ // two disjoint masks such as 0b00000000011000000000000000000000 and
+ // 0b00000000000000000000011110000000 where the inclusive/exclusive OR of the
+ // new masks matches the original mask.
+ unsigned LowestBitSet = llvm::countr_zero(Imm);
+ unsigned LowestGapBitUnset =
+ LowestBitSet + llvm::countr_one(Imm >> LowestBitSet);
+
+ // Create a mask for the least significant group of consecutive ones.
+ T NewImm1 = (static_cast<T>(1) << LowestGapBitUnset) -
+ (static_cast<T>(1) << LowestBitSet);
+ // Create a disjoint mask for the remaining ones.
+ T NewImm2 = Imm & ~NewImm1;
+ assert(((NewImm1 & NewImm2) == 0) && "Non-disjoint immediates!");
+
+ if (AArch64_AM::isLogicalImmediate(NewImm2, RegSize)) {
+ assert(((NewImm1 | NewImm2) == Imm) && "Invalid immediates!");
+ Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
+ Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
+ return true;
+ }
+
+ return false;
+}
+
+template <typename T>
+bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
+ SplitStrategy Strategy,
+ unsigned OtherOpc) {
+ // Try the transformations below.
//
- // MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
- // MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
+ // MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
+ // MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
//
// The mov pseudo instruction could be expanded to multiple mov instructions
// later. Let's try to split the constant operand of mov instruction into two
- // bitmask immediates. It makes only two AND instructions instead of multiple
- // mov + and instructions.
+ // bitmask immediates based on the given split strategy. It makes only two
+ // logical instructions instead of multiple mov + logic instructions.
return splitTwoPartImm<T>(
MI,
- [Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
- T &Imm1) -> std::optional<OpcodePair> {
- if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
+ [Opc, Strategy, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
+ T &Imm1) -> std::optional<OpcodePair> {
+ // If this immediate is already a suitable bitmask, don't do anything.
+ // TODO: Should we just combine the two instructions in this case?
+ if (AArch64_AM::isLogicalImmediate(Imm, RegSize))
+ return std::nullopt;
+
+ // If this immediate can be handled by one instruction, do not split it.
+ SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+ AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
+ if (Insn.size() == 1)
+ return std::nullopt;
+
+ bool SplitSucc = false;
+ switch (Strategy) {
+ case SplitStrategy::Intersect:
+ SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1);
+ break;
+ case SplitStrategy::Disjoint:
+ SplitSucc = splitDisjointBitmaskImm(Imm, RegSize, Imm0, Imm1);
+ break;
+ }
+ if (SplitSucc)
return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
return std::nullopt;
},
@@ -859,16 +906,36 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
Changed |= visitINSERT(MI);
break;
case AArch64::ANDWrr:
- Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
+ Changed |= trySplitLogicalImm<uint32_t>(AArch64::ANDWri, MI,
+ SplitStrategy::Intersect);
break;
case AArch64::ANDXrr:
- Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
+ Changed |= trySplitLogicalImm<uint64_t>(AArch64::ANDXri, MI,
+ SplitStrategy::Intersect);
break;
case AArch64::ANDSWrr:
- Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri);
+ Changed |= trySplitLogicalImm<uint32_t>(
+ AArch64::ANDWri, MI, SplitStrategy::Intersect, AArch64::ANDSWri);
break;
case AArch64::ANDSXrr:
- Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri);
+ Changed |= trySplitLogicalImm<uint64_t>(
+ AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri);
+ break;
+ case AArch64::EORWrr:
+ Changed |= trySplitLogicalImm<uint32_t>(AArch64::EORWri, MI,
+ SplitStrategy::Disjoint);
+ break;
+ case AArch64::EORXrr:
+ Changed |= trySplitLogicalImm<uint64_t>(AArch64::EORXri, MI,
+ SplitStrategy::Disjoint);
+ break;
+ case AArch64::ORRWrr:
+ Changed |= trySplitLogicalImm<uint32_t>(AArch64::ORRWri, MI,
+ SplitStrategy::Disjoint);
+ break;
+ case AArch64::ORRXrr:
+ Changed |= trySplitLogicalImm<uint64_t>(AArch64::ORRXri, MI,
+ SplitStrategy::Disjoint);
break;
case AArch64::ORRWrs:
Changed |= visitORR(MI);
diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
similarity index 71%
rename from llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
rename to llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
index 113eb14ca4803..4db9db9185206 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll
@@ -370,3 +370,175 @@ entry:
%r = select i1 %c, i64 %a, i64 %ands
ret i64 %r
}
+
+; Test EOR.
+define i32 @test1_eor(i32 %a) {
+; CHECK-LABEL: test1_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: eor w8, w0, #0x400
+; CHECK-NEXT: eor w0, w8, #0x200000
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i32 %a, 2098176
+ ret i32 %eor
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i32 @test2_eor(i32 %a) {
+; CHECK-LABEL: test2_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: eor w0, w0, w8
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i32 %a, 135
+ ret i32 %eor
+}
+
+; This constant should not be split because the split immediate is not a valid
+; bitmask immediate.
+define i32 @test3_eor(i32 %a) {
+; CHECK-LABEL: test3_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: eor w0, w0, w8
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i32 %a, 2163712
+ ret i32 %eor
+}
+
+define i64 @test4_eor(i64 %a) {
+; CHECK-LABEL: test4_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: eor x8, x0, #0x400
+; CHECK-NEXT: eor x0, x8, #0x200000
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i64 %a, 2098176
+ ret i64 %eor
+}
+
+define i64 @test5_eor(i64 %a) {
+; CHECK-LABEL: test5_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: eor x8, x0, #0x4000
+; CHECK-NEXT: eor x0, x8, #0x200000000
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i64 %a, 8589950976
+ ret i64 %eor
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i64 @test6_eor(i64 %a) {
+; CHECK-LABEL: test6_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: eor x0, x0, x8
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i64 %a, 135
+ ret i64 %eor
+}
+
+; This constant should not be split because the split immediate is not a valid
+; bitmask immediate.
+define i64 @test7_eor(i64 %a) {
+; CHECK-LABEL: test7_eor:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: eor x0, x0, x8
+; CHECK-NEXT: ret
+entry:
+ %eor = xor i64 %a, 2163712
+ ret i64 %eor
+}
+
+; Test ORR.
+define i32 @test1_orr(i32 %a) {
+; CHECK-LABEL: test1_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: orr w8, w0, #0x400
+; CHECK-NEXT: orr w0, w8, #0x200000
+; CHECK-NEXT: ret
+entry:
+ %orr = or i32 %a, 2098176
+ ret i32 %orr
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i32 @test2_orr(i32 %a) {
+; CHECK-LABEL: test2_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: orr w0, w0, w8
+; CHECK-NEXT: ret
+entry:
+ %orr = or i32 %a, 135
+ ret i32 %orr
+}
+
+; This constant should not be split because the split immediate is not a valid
+; bitmask immediate.
+define i32 @test3_orr(i32 %a) {
+; CHECK-LABEL: test3_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: orr w0, w0, w8
+; CHECK-NEXT: ret
+entry:
+ %orr = or i32 %a, 2163712
+ ret i32 %orr
+}
+
+define i64 @test4_orr(i64 %a) {
+; CHECK-LABEL: test4_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: orr x8, x0, #0x400
+; CHECK-NEXT: orr x0, x8, #0x200000
+; CHECK-NEXT: ret
+entry:
+ %orr = or i64 %a, 2098176
+ ret i64 %orr
+}
+
+define i64 @test5_orr(i64 %a) {
+; CHECK-LABEL: test5_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: orr x8, x0, #0x4000
+; CHECK-NEXT: orr x0, x8, #0x200000000
+; CHECK-NEXT: ret
+entry:
+ %orr = or i64 %a, 8589950976
+ ret i64 %orr
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i64 @test6_orr(i64 %a) {
+; CHECK-LABEL: test6_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: orr x0, x0, x8
+; CHECK-NEXT: ret
+entry:
+ %orr = or i64 %a, 135
+ ret i64 %orr
+}
+
+; This constant should not be split because the split immediate is not a valid
+; bitmask immediate.
+define i64 @test7_orr(i64 %a) {
+; CHECK-LABEL: test7_orr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: orr x0, x0, x8
+; CHECK-NEXT: ret
+entry:
+ %orr = or i64 %a, 2163712
+ ret i64 %orr
+}
|
This patch generalises the logic for splitting bitmasks for AND/ANDS immediate instructions, to prepare it to handle more instructions as in llvm#150394.
This patch generalises the logic for splitting bitmasks for AND/ANDS immediate instructions, to prepare it to handle more opcodes, as in #150394.
68a008d
to
2c933ed
Compare
Rebased on top of #150619. |
This patch extends llvm#149095 for EOR and ORR. It uses a simple partition scheme to try to find two suitable disjoint bitmasks that can be used with EOR/ORR to reconstruct the original mask.
4030752
to
9627500
Compare
Rebased to attempt to sort out the CI failures on |
Thank you so much! I have been trying to figure out how to do this for a while, but then I got caught up with other PRs. Out of curiosity, have you read https://devblogs.microsoft.com/oldnewthing/20220808-00/?p=106953? It also talks about this. |
For what it is worth, I wonder whether the assembler can take care of many of these situations for us (cases where we can materialize a constant via adds/subs with the zero register), or whether we have to do that ourselves. |
For example, if we wanted to materialize a number of the form 0xFFFFFFFF`FFFFFXXX, we could do sub Xd, xzr, #imm12 |
This patch generalises the logic for splitting bitmasks for AND/ANDS immediate instructions, to prepare it to handle more opcodes, as in llvm#150394.
This patch extends #149095 for EOR and ORR.
It uses a simple partition scheme to try to find two suitable disjoint
bitmasks that can be used with EOR/ORR to reconstruct the original mask.
Fixes: #148987.