Skip to content

Commit 68a008d

Browse files
committed
[AArch64] Allow splitting bitmasks for EOR/ORR.
This patch extends #149095 for EOR and ORR. It uses a simple partition scheme to try to find two suitable disjoint bitmasks that can be used with EOR/ORR to reconstruct the original mask. It also restructures the original code to allow reusing its infrastructure and more easily adding other partition schemes in the future.
1 parent 4f0ec22 commit 68a008d

File tree

2 files changed

+105
-44
lines changed

2 files changed

+105
-44
lines changed

llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp

Lines changed: 93 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@
88
//
99
// This pass performs below peephole optimizations on MIR level.
1010
//
11-
// 1. MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
12-
// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
11+
// 1. MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
12+
// MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
1313
//
1414
// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
15-
// MOVi64imm + ADDXrr ==> ANDXri + ANDXri
15+
// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
1616
//
1717
// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
1818
// MOVi64imm + SUBXrr ==> SUBXri + SUBXri
@@ -125,8 +125,14 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
125125
template <typename T>
126126
bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI);
127127

128+
// Strategy used to split logical immediate bitmasks.
129+
enum class SplitStrategy {
130+
// Dispatches to splitBitmaskImm; the opcode switch in
// runOnMachineFunction selects this for the AND/ANDS register forms.
Intersect,
131+
// Dispatches to splitDisjointBitmaskImm; the opcode switch in
// runOnMachineFunction selects this for the EOR/ORR register forms.
Disjoint,
132+
};
128133
template <typename T>
129-
bool visitAND(unsigned Opc, MachineInstr &MI, unsigned OtherOpc = 0);
134+
bool trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
135+
SplitStrategy Strategy, unsigned OtherOpc = 0);
130136
bool visitORR(MachineInstr &MI);
131137
bool visitCSEL(MachineInstr &MI);
132138
bool visitINSERT(MachineInstr &MI);
@@ -158,14 +164,6 @@ INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
158164
template <typename T>
159165
static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
160166
T UImm = static_cast<T>(Imm);
161-
if (AArch64_AM::isLogicalImmediate(UImm, RegSize))
162-
return false;
163-
164-
// If this immediate can be handled by one instruction, do not split it.
165-
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
166-
AArch64_IMM::expandMOVImm(UImm, RegSize, Insn);
167-
if (Insn.size() == 1)
168-
return false;
169167

170168
// The bitmask immediate consists of consecutive ones. Let's say there is
171169
// constant 0b00000000001000000000010000000000 which does not consist of
@@ -194,23 +192,72 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
194192
}
195193

196194
template <typename T>
197-
bool AArch64MIPeepholeOpt::visitAND(unsigned Opc, MachineInstr &MI,
198-
unsigned OtherOpc) {
199-
// Try below transformation.
195+
// Splits Imm into two non-overlapping masks: the lowest run of consecutive set
// bits (NewImm1) and every remaining set bit (NewImm2).
//
// Imm      - the constant to split.
// RegSize  - register width passed through to the AArch64_AM helpers.
// Imm1Enc  - on success, receives the encoding of the lowest-run mask.
// Imm2Enc  - on success, receives the encoding of the remaining-bits mask.
//
// Returns true only when NewImm2 is a logical immediate for RegSize; otherwise
// returns false and leaves Imm1Enc/Imm2Enc unwritten.
//
// NOTE(review): the shifts below assume Imm is non-zero and that its lowest
// run of ones ends below the top bit of T; otherwise a shift amount would
// presumably equal the bit width of T. The visible caller rejects logical
// immediates and single-instruction MOV immediates before calling - confirm
// that this covers both cases.
static bool splitDisjointBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc,
196+
T &Imm2Enc) {
197+
// Try to split a bitmask of the form 0b00000000011000000000011110000000 into
198+
// two disjoint masks such as 0b00000000011000000000000000000000 and
199+
// 0b00000000000000000000011110000000 where the inclusive/exclusive OR of the
200+
// new masks matches the original mask.
201+
// Index of the lowest set bit of Imm.
unsigned LowestBitSet = llvm::countr_zero(Imm);
202+
// Index of the first clear bit above that lowest run of set bits.
unsigned LowestGapBitUnset =
203+
LowestBitSet + llvm::countr_one(Imm >> LowestBitSet);
204+
205+
// Create a mask for the least significant group of consecutive ones.
206+
T NewImm1 = (static_cast<T>(1) << LowestGapBitUnset) -
207+
(static_cast<T>(1) << LowestBitSet);
208+
// Create a disjoint mask for the remaining ones.
209+
T NewImm2 = Imm & ~NewImm1;
210+
assert(((NewImm1 & NewImm2) == 0) && "Non-disjoint immediates!");
211+
212+
// Only NewImm2 is tested for encodability here.
// NOTE(review): NewImm1 is encoded without an isLogicalImmediate check,
// presumably because a single contiguous run of ones is always encodable -
// confirm.
if (AArch64_AM::isLogicalImmediate(NewImm2, RegSize)) {
213+
assert(((NewImm1 | NewImm2) == Imm) && "Invalid immediates!");
214+
Imm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
215+
Imm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
216+
return true;
217+
}
218+
219+
return false;
220+
}
221+
222+
template <typename T>
223+
bool AArch64MIPeepholeOpt::trySplitLogicalImm(unsigned Opc, MachineInstr &MI,
224+
SplitStrategy Strategy,
225+
unsigned OtherOpc) {
226+
// Try the transformations below.
200227
//
201-
// MOVi32imm + ANDS?Wrr ==> ANDWri + ANDS?Wri
202-
// MOVi64imm + ANDS?Xrr ==> ANDXri + ANDS?Xri
228+
// MOVi32imm + (ANDS?|EOR|ORR)Wrr ==> (AND|EOR|ORR)Wri + (ANDS?|EOR|ORR)Wri
229+
// MOVi64imm + (ANDS?|EOR|ORR)Xrr ==> (AND|EOR|ORR)Xri + (ANDS?|EOR|ORR)Xri
203230
//
204231
// The mov pseudo instruction could be expanded to multiple mov instructions
205232
// later. Let's try to split the constant operand of mov instruction into two
206-
// bitmask immediates. It makes only two AND instructions instead of multiple
207-
// mov + and instructions.
233+
// bitmask immediates based on the given split strategy. It makes only two
234+
// logical instructions instead of multiple mov + logic instructions.
208235

209236
return splitTwoPartImm<T>(
210237
MI,
211-
[Opc, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
212-
T &Imm1) -> std::optional<OpcodePair> {
213-
if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
238+
[Opc, Strategy, OtherOpc](T Imm, unsigned RegSize, T &Imm0,
239+
T &Imm1) -> std::optional<OpcodePair> {
240+
// If this immediate is already a suitable bitmask, don't do anything.
241+
// TODO: Should we just combine the two instructions in this case?
242+
if (AArch64_AM::isLogicalImmediate(Imm, RegSize))
243+
return std::nullopt;
244+
245+
// If this immediate can be handled by one instruction, do not split it.
246+
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
247+
AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
248+
if (Insn.size() == 1)
249+
return std::nullopt;
250+
251+
bool SplitSucc = false;
252+
switch (Strategy) {
253+
case SplitStrategy::Intersect:
254+
SplitSucc = splitBitmaskImm(Imm, RegSize, Imm0, Imm1);
255+
break;
256+
case SplitStrategy::Disjoint:
257+
SplitSucc = splitDisjointBitmaskImm(Imm, RegSize, Imm0, Imm1);
258+
break;
259+
}
260+
if (SplitSucc)
214261
return std::make_pair(Opc, !OtherOpc ? Opc : OtherOpc);
215262
return std::nullopt;
216263
},
@@ -859,16 +906,36 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
859906
Changed |= visitINSERT(MI);
860907
break;
861908
case AArch64::ANDWrr:
862-
Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI);
909+
Changed |= trySplitLogicalImm<uint32_t>(AArch64::ANDWri, MI,
910+
SplitStrategy::Intersect);
863911
break;
864912
case AArch64::ANDXrr:
865-
Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI);
913+
Changed |= trySplitLogicalImm<uint64_t>(AArch64::ANDXri, MI,
914+
SplitStrategy::Intersect);
866915
break;
867916
case AArch64::ANDSWrr:
868-
Changed |= visitAND<uint32_t>(AArch64::ANDWri, MI, AArch64::ANDSWri);
917+
Changed |= trySplitLogicalImm<uint32_t>(
918+
AArch64::ANDWri, MI, SplitStrategy::Intersect, AArch64::ANDSWri);
869919
break;
870920
case AArch64::ANDSXrr:
871-
Changed |= visitAND<uint64_t>(AArch64::ANDXri, MI, AArch64::ANDSXri);
921+
Changed |= trySplitLogicalImm<uint64_t>(
922+
AArch64::ANDXri, MI, SplitStrategy::Intersect, AArch64::ANDSXri);
923+
break;
924+
case AArch64::EORWrr:
925+
Changed |= trySplitLogicalImm<uint32_t>(AArch64::EORWri, MI,
926+
SplitStrategy::Disjoint);
927+
break;
928+
case AArch64::EORXrr:
929+
Changed |= trySplitLogicalImm<uint64_t>(AArch64::EORXri, MI,
930+
SplitStrategy::Disjoint);
931+
break;
932+
case AArch64::ORRWrr:
933+
Changed |= trySplitLogicalImm<uint32_t>(AArch64::ORRWri, MI,
934+
SplitStrategy::Disjoint);
935+
break;
936+
case AArch64::ORRXrr:
937+
Changed |= trySplitLogicalImm<uint64_t>(AArch64::ORRXri, MI,
938+
SplitStrategy::Disjoint);
872939
break;
873940
case AArch64::ORRWrs:
874941
Changed |= visitORR(MI);

llvm/test/CodeGen/AArch64/aarch64-split-logic-bitmask-immediate.ll

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -375,9 +375,8 @@ entry:
375375
define i32 @test1_eor(i32 %a) {
376376
; CHECK-LABEL: test1_eor:
377377
; CHECK: // %bb.0: // %entry
378-
; CHECK-NEXT: mov w8, #1024 // =0x400
379-
; CHECK-NEXT: movk w8, #32, lsl #16
380-
; CHECK-NEXT: eor w0, w0, w8
378+
; CHECK-NEXT: eor w8, w0, #0x400
379+
; CHECK-NEXT: eor w0, w8, #0x200000
381380
; CHECK-NEXT: ret
382381
entry:
383382
%eor = xor i32 %a, 2098176
@@ -413,9 +412,8 @@ entry:
413412
define i64 @test4_eor(i64 %a) {
414413
; CHECK-LABEL: test4_eor:
415414
; CHECK: // %bb.0: // %entry
416-
; CHECK-NEXT: mov w8, #1024 // =0x400
417-
; CHECK-NEXT: movk w8, #32, lsl #16
418-
; CHECK-NEXT: eor x0, x0, x8
415+
; CHECK-NEXT: eor x8, x0, #0x400
416+
; CHECK-NEXT: eor x0, x8, #0x200000
419417
; CHECK-NEXT: ret
420418
entry:
421419
%eor = xor i64 %a, 2098176
@@ -425,9 +423,8 @@ entry:
425423
define i64 @test5_eor(i64 %a) {
426424
; CHECK-LABEL: test5_eor:
427425
; CHECK: // %bb.0: // %entry
428-
; CHECK-NEXT: mov x8, #16384 // =0x4000
429-
; CHECK-NEXT: movk x8, #2, lsl #32
430-
; CHECK-NEXT: eor x0, x0, x8
426+
; CHECK-NEXT: eor x8, x0, #0x4000
427+
; CHECK-NEXT: eor x0, x8, #0x200000000
431428
; CHECK-NEXT: ret
432429
entry:
433430
%eor = xor i64 %a, 8589950976
@@ -464,9 +461,8 @@ entry:
464461
define i32 @test1_orr(i32 %a) {
465462
; CHECK-LABEL: test1_orr:
466463
; CHECK: // %bb.0: // %entry
467-
; CHECK-NEXT: mov w8, #1024 // =0x400
468-
; CHECK-NEXT: movk w8, #32, lsl #16
469-
; CHECK-NEXT: orr w0, w0, w8
464+
; CHECK-NEXT: orr w8, w0, #0x400
465+
; CHECK-NEXT: orr w0, w8, #0x200000
470466
; CHECK-NEXT: ret
471467
entry:
472468
%orr = or i32 %a, 2098176
@@ -502,9 +498,8 @@ entry:
502498
define i64 @test4_orr(i64 %a) {
503499
; CHECK-LABEL: test4_orr:
504500
; CHECK: // %bb.0: // %entry
505-
; CHECK-NEXT: mov w8, #1024 // =0x400
506-
; CHECK-NEXT: movk w8, #32, lsl #16
507-
; CHECK-NEXT: orr x0, x0, x8
501+
; CHECK-NEXT: orr x8, x0, #0x400
502+
; CHECK-NEXT: orr x0, x8, #0x200000
508503
; CHECK-NEXT: ret
509504
entry:
510505
%orr = or i64 %a, 2098176
@@ -514,9 +509,8 @@ entry:
514509
define i64 @test5_orr(i64 %a) {
515510
; CHECK-LABEL: test5_orr:
516511
; CHECK: // %bb.0: // %entry
517-
; CHECK-NEXT: mov x8, #16384 // =0x4000
518-
; CHECK-NEXT: movk x8, #2, lsl #32
519-
; CHECK-NEXT: orr x0, x0, x8
512+
; CHECK-NEXT: orr x8, x0, #0x4000
513+
; CHECK-NEXT: orr x0, x8, #0x200000000
520514
; CHECK-NEXT: ret
521515
entry:
522516
%orr = or i64 %a, 8589950976

0 commit comments

Comments
 (0)