Skip to content

Commit b5bab0f

Browse files
author
Baoshan Pang
committed
[AMDGPU] add s_bitset[10]_b32 optimization for shl+[or,andn2] pattern
1 parent e038c54 commit b5bab0f

File tree

6 files changed

+201
-92
lines changed

6 files changed

+201
-92
lines changed

llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "Utils/AMDGPUBaseInfo.h"
1616
#include "llvm/ADT/Statistic.h"
1717
#include "llvm/CodeGen/MachineFunctionPass.h"
18+
#include "llvm/CodeGen/MachineOperand.h"
1819

1920
#define DEBUG_TYPE "si-shrink-instructions"
2021

@@ -44,6 +45,7 @@ class SIShrinkInstructions {
4445
void shrinkMIMG(MachineInstr &MI) const;
4546
void shrinkMadFma(MachineInstr &MI) const;
4647
bool shrinkScalarLogicOp(MachineInstr &MI) const;
48+
bool shrinkToBitset(MachineInstr &MI) const;
4749
bool tryReplaceDeadSDST(MachineInstr &MI) const;
4850
bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
4951
Register Reg, unsigned SubReg) const;
@@ -577,8 +579,7 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
577579
const bool IsUndef = SrcReg->isUndef();
578580
const bool IsKill = SrcReg->isKill();
579581
MI.setDesc(TII->get(Opc));
580-
if (Opc == AMDGPU::S_BITSET0_B32 ||
581-
Opc == AMDGPU::S_BITSET1_B32) {
582+
if (Opc == AMDGPU::S_BITSET0_B32 || Opc == AMDGPU::S_BITSET1_B32) {
582583
Src0->ChangeToImmediate(NewImm);
583584
// Remove the immediate and add the tied input.
584585
MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
@@ -594,6 +595,64 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
594595
return false;
595596
}
596597

598+
// case 1:
599+
// From:
600+
// s_lshl_b32 s1, 1, s1
601+
// s_or_b32 s0, s0, s1
602+
// To:
603+
// s_bitset1_b32 s0, s1
604+
//
605+
// case 2:
606+
// s_lshl_b32 s1, 1, s1
607+
// s_andn2_b32 s0, s0, s1
608+
// To:
609+
// s_bitset0_b32 s0, s1
610+
bool SIShrinkInstructions::shrinkToBitset(MachineInstr &MI) const {
611+
unsigned Opc = MI.getOpcode();
612+
const MachineOperand *Dest = &MI.getOperand(0);
613+
MachineOperand *Src0 = &MI.getOperand(1);
614+
MachineOperand *Src1 = &MI.getOperand(2);
615+
616+
if (Src0->isReg() && Src1->isReg() && Dest->getReg() == Src0->getReg()) {
617+
MachineInstr *Shl = MRI->getUniqueVRegDef(Src1->getReg());
618+
if (Shl && Shl->getOpcode() == AMDGPU::S_LSHL_B32 &&
619+
Shl->getOperand(1).isImm() && Shl->getOperand(1).getImm() == 1 &&
620+
MI.getParent() == Shl->getParent()) {
621+
int ShlDestReg = Shl->getOperand(0).getReg();
622+
int ShlSrc1Reg = Shl->getOperand(2).getReg();
623+
624+
if (MRI->hasAtMostUserInstrs(ShlDestReg, 2)) {
625+
bool IsKilled = false;
626+
for (auto IE = MI.getIterator(), I = std::next(Shl->getIterator());
627+
I != IE; ++I) {
628+
for (MachineOperand &MO : I->operands()) {
629+
if (MO.isReg() && MO.getReg() == ShlSrc1Reg) {
630+
if (MO.isDef())
631+
return true;
632+
if (MO.isKill()) {
633+
MO.setIsKill(false);
634+
IsKilled = true;
635+
}
636+
}
637+
}
638+
}
639+
unsigned int NewOpc = (Opc == AMDGPU::S_OR_B32) ? AMDGPU::S_BITSET1_B32
640+
: AMDGPU::S_BITSET0_B32;
641+
MI.setDesc(TII->get(NewOpc));
642+
Src0->setReg(ShlSrc1Reg);
643+
Src0->setIsKill(IsKilled);
644+
MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
645+
/*isImp*/ false, Src0->isKill(),
646+
/*isDead*/ false, Src0->isUndef());
647+
MI.tieOperands(0, 2);
648+
Shl->eraseFromParent();
649+
}
650+
}
651+
return true;
652+
}
653+
return false;
654+
}
655+
597656
// This is the same as MachineInstr::readsRegister/modifiesRegister except
598657
// it takes subregs into account.
599658
bool SIShrinkInstructions::instAccessReg(
@@ -951,6 +1010,12 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
9511010
continue;
9521011
}
9531012

1013+
if (MI.getOpcode() == AMDGPU::S_ANDN2_B32 ||
1014+
MI.getOpcode() == AMDGPU::S_OR_B32) {
1015+
if (shrinkToBitset(MI))
1016+
continue;
1017+
}
1018+
9541019
if (TII->isMIMG(MI.getOpcode()) &&
9551020
ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
9561021
MF.getProperties().hasProperty(

llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -728,9 +728,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
728728
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
729729
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
730730
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
731-
; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
732731
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
733-
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
732+
; GFX10W32-NEXT: s_bitset0_b32 s1, s2
734733
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
735734
; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
736735
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
@@ -813,9 +812,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
813812
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
814813
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
815814
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
816-
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
817815
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
818-
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
816+
; GFX11W32-NEXT: s_bitset0_b32 s1, s2
819817
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
820818
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
821819
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
@@ -898,9 +896,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
898896
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
899897
; GFX12W32-NEXT: s_wait_alu 0xfffe
900898
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
901-
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
902899
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
903-
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
900+
; GFX12W32-NEXT: s_bitset0_b32 s1, s2
904901
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
905902
; GFX12W32-NEXT: s_wait_alu 0xfffe
906903
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
@@ -1120,9 +1117,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
11201117
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
11211118
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
11221119
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
1123-
; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
11241120
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
1125-
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
1121+
; GFX10W32-NEXT: s_bitset0_b32 s1, s2
11261122
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
11271123
; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
11281124
; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1211,9 +1207,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
12111207
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
12121208
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
12131209
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
1214-
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
12151210
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
1216-
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
1211+
; GFX11W32-NEXT: s_bitset0_b32 s1, s2
12171212
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
12181213
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
12191214
; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1301,9 +1296,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
13011296
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
13021297
; GFX12W32-NEXT: s_wait_alu 0xfffe
13031298
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
1304-
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
13051299
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
1306-
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
1300+
; GFX12W32-NEXT: s_bitset0_b32 s1, s2
13071301
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
13081302
; GFX12W32-NEXT: s_wait_alu 0xfffe
13091303
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
@@ -2183,9 +2177,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
21832177
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
21842178
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
21852179
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
2186-
; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
21872180
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
2188-
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
2181+
; GFX10W32-NEXT: s_bitset0_b32 s1, s2
21892182
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
21902183
; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
21912184
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2268,9 +2261,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
22682261
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
22692262
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
22702263
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
2271-
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
22722264
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
2273-
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
2265+
; GFX11W32-NEXT: s_bitset0_b32 s1, s2
22742266
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
22752267
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
22762268
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2354,9 +2346,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
23542346
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
23552347
; GFX12W32-NEXT: s_wait_alu 0xfffe
23562348
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
2357-
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
23582349
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
2359-
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
2350+
; GFX12W32-NEXT: s_bitset0_b32 s1, s2
23602351
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
23612352
; GFX12W32-NEXT: s_wait_alu 0xfffe
23622353
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0

0 commit comments

Comments
 (0)