[AMDGPU][SIFoldOperand] Hoist readlane through some instructions #129687
Conversation
SIFoldOperand version of llvm#129037. Handles a limited number of opcodes because going from VALU to SALU isn't trivial, and we don't have a helper for it. I looked at our test suite and added all opcodes that were eligible and appeared as v_read(first)lane operands.
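In assembly terms, the rewrite has the following shape (a simplified excerpt based on one of the GFX9 test updates in this patch; the register numbers are illustrative, not a guarantee of what the allocator picks in every case):

; before: the mask is applied in the VALU, then the result is moved to an SGPR
v_and_b32_e32 v0, 0xff, v0
v_readfirstlane_b32 s4, v0

; after: the readfirstlane is hoisted above the op, which becomes a SALU instruction
v_readfirstlane_b32 s4, v0
s_and_b32 s4, s4, 0xff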
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu
Author: Pierre van Houtryve (Pierre-vh)
Changes: SIFoldOperand version of #129037. Handles a limited number of opcodes because going from VALU to SALU isn't trivial, and we don't have a helper for it.
Patch is 155.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/129687.diff
14 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index eb9aabf8b6317..67832cfc0c571 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -126,6 +126,7 @@ class SIFoldOperandsImpl {
std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
bool tryConstantFoldOp(MachineInstr *MI) const;
bool tryFoldCndMask(MachineInstr &MI) const;
+ bool tryScalarizeReadLaneSrc(MachineInstr &MI) const;
bool tryFoldZeroHighBits(MachineInstr &MI) const;
bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
@@ -1407,6 +1408,148 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
return true;
}
+static unsigned
+getScalarizedReadLaneSrcOpc(const GCNSubtarget &ST, unsigned Opc,
+ SmallVectorImpl<MachineOperand *> &Ops) {
+ // Opcodes here are added as-needed because there are hundreds of
+ // instructions we could convert, but realistically we only need
+ // the most frequent ones to make an impact.
+ //
+ // The InstCombine version of this transform will do the heavy
+ // lifting, this is just a cleanup for the readlanes added during
+ // lowering.
+ switch (Opc) {
+ case AMDGPU::V_OR_B32_e32:
+ case AMDGPU::V_OR_B32_e64:
+ return AMDGPU::S_OR_B32;
+ case AMDGPU::V_MUL_HI_U32_e64:
+ if (ST.getGeneration() >= GCNSubtarget::GFX9)
+ return AMDGPU::S_MUL_HI_U32;
+ break;
+ case AMDGPU::V_AND_B32_e32:
+ case AMDGPU::V_AND_B32_e64:
+ return AMDGPU::S_AND_B32;
+ case AMDGPU::V_LSHRREV_B32_e32: // dst = S1 >> S0
+ case AMDGPU::V_LSHRREV_B32_e64:
+ std::swap(Ops[0], Ops[1]); // dst = S0 >> S1 (!)
+ return AMDGPU::S_LSHR_B32;
+ case AMDGPU::V_CVT_U32_F32_e32:
+ case AMDGPU::V_CVT_U32_F32_e64:
+ if (ST.hasSALUFloatInsts())
+ return AMDGPU::S_CVT_U32_F32;
+ break;
+ case AMDGPU::V_MIN_U32_e32:
+ case AMDGPU::V_MIN_U32_e64:
+ return AMDGPU::S_MIN_U32;
+ case AMDGPU::V_MIN_I32_e32:
+ case AMDGPU::V_MIN_I32_e64:
+ return AMDGPU::S_MIN_I32;
+ case AMDGPU::V_MAX_U32_e32:
+ case AMDGPU::V_MAX_U32_e64:
+ return AMDGPU::S_MAX_U32;
+ case AMDGPU::V_MAX_I32_e32:
+ case AMDGPU::V_MAX_I32_e64:
+ return AMDGPU::S_MAX_I32;
+ default:
+ break;
+ }
+
+ return -1;
+}
+
+// Try to transform
+// %0:vgpr = (valu op) %x:vgpr
+// %1:sgpr = v_readfirstlane %0
+// Into
+// %0:sgpr = v_readfirstlane %x:vgpr
+// %1:sgpr = (salu op) %0
+bool SIFoldOperandsImpl::tryScalarizeReadLaneSrc(MachineInstr &MI) const {
+ const unsigned Opc = MI.getOpcode();
+ if (Opc != AMDGPU::V_READFIRSTLANE_B32 && Opc != AMDGPU::V_READLANE_B32)
+ return false;
+
+ const auto VSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ const Register VSrc = MI.getOperand(VSrcIdx).getReg();
+
+ if (!MRI->hasOneNonDBGUse(VSrc))
+ return false;
+
+ MachineInstr *VSrcDef = MRI->getVRegDef(VSrc);
+ // Need a unary or binary VALU instruction as operand.
+ if (!VSrcDef || (VSrcDef->getParent() != MI.getParent()) ||
+ !TII->isVALU(*VSrcDef) || VSrcDef->getNumExplicitOperands() > 3 ||
+ execMayBeModifiedBeforeUse(*MRI, VSrc, *VSrcDef, MI))
+ return false;
+
+ const bool IsReadLane = (Opc == AMDGPU::V_READLANE_B32);
+ if (IsReadLane) {
+ MachineOperand &LaneOp = MI.getOperand(2);
+ if (LaneOp.isReg()) { // Can the lane be an imm?
+ Register LaneReg = LaneOp.getReg();
+ for (auto It = VSrcDef->getIterator(); It != MI.getIterator(); ++It) {
+ if (It->modifiesRegister(LaneReg, TRI))
+ return false;
+ }
+ }
+ }
+
+ SmallVector<MachineOperand *, 2> Ops;
+ MachineOperand *TargetOp = nullptr;
+ for (MachineOperand &SrcOp : VSrcDef->operands()) {
+ if (SrcOp.isReg()) {
+ if (SrcOp.isImplicit() || SrcOp.isDef())
+ continue;
+
+ Ops.push_back(&SrcOp);
+
+ Register Reg = SrcOp.getReg();
+ if (TRI->isVectorRegister(*MRI, Reg)) {
+ // This only works if we have one VGPR src.
+ if (TargetOp)
+ return false;
+ TargetOp = &SrcOp;
+ }
+ } else {
+ Ops.push_back(&SrcOp); // also collect imms
+ }
+ }
+ if (!TargetOp)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "tryScalarizeReadLaneSrc:\n\treadlane: " << MI
+ << "\tsrc: " << *VSrcDef << "\top: " << *TargetOp << "\n");
+
+ const unsigned ScalarOp =
+ getScalarizedReadLaneSrcOpc(*ST, VSrcDef->getOpcode(), Ops);
+ if (ScalarOp == unsigned(-1))
+ return false;
+
+ // We only support unary/binary ops.
+ assert(Ops.size() <= 2);
+
+ MachineBasicBlock *MBB = VSrcDef->getParent();
+ auto InsertBefore = VSrcDef->getIterator();
+ const DebugLoc &DL = VSrcDef->getDebugLoc();
+ Register SDst = MI.getOperand(0).getReg();
+
+ Register STargetOp = MRI->createVirtualRegister(MRI->getRegClass(SDst));
+ auto NewMI = BuildMI(*MBB, InsertBefore, DL, MI.getDesc(), STargetOp)
+ .addReg(TargetOp->getReg());
+ if (IsReadLane)
+ NewMI.add(MI.getOperand(2)); // lane index
+ auto ScalarMI = BuildMI(*MBB, InsertBefore, DL, TII->get(ScalarOp), SDst);
+ for (MachineOperand *Op : Ops) {
+ if (Op == TargetOp)
+ ScalarMI.addReg(STargetOp);
+ else
+ ScalarMI.add(*Op);
+ }
+
+ VSrcDef->eraseFromParent();
+ MI.eraseFromParent();
+ return true;
+}
+
bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
MI.getOpcode() != AMDGPU::V_AND_B32_e32)
@@ -2353,6 +2496,11 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
for (auto &MI : make_early_inc_range(*MBB)) {
Changed |= tryFoldCndMask(MI);
+ if (tryScalarizeReadLaneSrc(MI)) {
+ Changed = true;
+ continue;
+ }
+
if (tryFoldZeroHighBits(MI)) {
Changed = true;
continue;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 2389924b82484..76cdbaa661579 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -730,18 +730,18 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX6-NEXT: s_lshl_b32 s3, s4, 24
; GFX6-NEXT: s_lshl_b32 s4, s7, 24
; GFX6-NEXT: s_not_b32 s5, s3
-; GFX6-NEXT: s_min_u32 s4, s5, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
+; GFX6-NEXT: s_min_u32 s4, s5, s4
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: s_add_i32 s3, s3, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: s_lshr_b32 s3, s3, 24
; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
+; GFX6-NEXT: s_lshr_b32 s3, s3, 24
; GFX6-NEXT: s_lshl_b32 s0, s2, 16
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: s_lshl_b32 s0, s3, 24
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: v_readfirstlane_b32 s1, v0
+; GFX6-NEXT: s_or_b32 s0, s1, s0
+; GFX6-NEXT: s_lshl_b32 s1, s3, 24
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_v4i8:
@@ -1020,8 +1020,8 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_lshr_b32 s0, s0, 8
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_i24:
@@ -1030,8 +1030,8 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_uaddsat_i24:
@@ -1039,8 +1039,8 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 8
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs)
ret i24 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 34d36581a21db..0311e0fb4d68c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -714,18 +714,18 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX6-NEXT: s_sub_i32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s4, 24
; GFX6-NEXT: s_lshl_b32 s4, s7, 24
-; GFX6-NEXT: s_min_u32 s4, s3, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
+; GFX6-NEXT: s_min_u32 s4, s3, s4
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: s_sub_i32 s3, s3, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: s_lshr_b32 s3, s3, 24
; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
+; GFX6-NEXT: s_lshr_b32 s3, s3, 24
; GFX6-NEXT: s_lshl_b32 s0, s2, 16
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: s_lshl_b32 s0, s3, 24
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: v_readfirstlane_b32 s1, v0
+; GFX6-NEXT: s_or_b32 s0, s1, s0
+; GFX6-NEXT: s_lshl_b32 s1, s3, 24
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_v4i8:
@@ -1002,8 +1002,8 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_lshr_b32 s0, s0, 8
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_i24:
@@ -1012,8 +1012,8 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_usubsat_i24:
@@ -1021,8 +1021,8 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 8
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs)
ret i24 %result
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 3737cc414c58f..39f9bd3768a42 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -7180,10 +7180,10 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT: s_and_b32 s4, s4, 0xff
+; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0
; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -7214,10 +7214,10 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
; GFX8-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX8-NEXT: .LBB12_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: s_and_b32 s4, s4, 0xff
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -7251,10 +7251,10 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
; GFX9-NEXT: v_lshrrev_b32_e32 v0, s2, v0
; GFX9-NEXT: .LBB12_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: s_and_b32 s4, s4, 0xff
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -7289,11 +7289,11 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s7, v0
; GFX1064-NEXT: .LBB12_2:
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_and_b32 s2, s2, 0xff
; GFX1064-NEXT: v_or_b32_e32 v0, s2, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -7325,11 +7325,11 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s5, v0
; GFX1032-NEXT: .LBB12_2:
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc_lo
+; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_and_b32 s2, s2, 0xff
; GFX1032-NEXT: v_or_b32_e32 v0, s2, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -7363,13 +7363,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
; GFX1164-NEXT: v_lshrrev_b32_e32 v0, s7, v0
; GFX1164-NEXT: .LBB12_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_and_b32 s2, s2, 0xff
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_or_b32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b8 v0, off, s[0:3], 0
@@ -7402,13 +7402,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
; GFX1132-NEXT: v_lshrrev_b32_e32 v0, s6, v0
; GFX1132-NEXT: .LBB12_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_and_b32 s2, s2, 0xff
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_or_b32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: buffer_store_b8 v0, off, s[0:3], 0
@@ -7442,13 +7442,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
; GFX1264-NEXT: v_lshrrev_b32_e32 v0, s7, v0
; GFX1264-NEXT: .LBB12_2:
; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
; GFX1264-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_and_b32 s2, s2, 0xff
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1264-NEXT: v_or_b32_e32 v0, s2, v0
; GFX1264-NEXT: s_mov_b32 s2, -1
; GFX1264-NEXT: buffer_store_b8 v0, off, s[0:3], null
@@ -7481,13 +7481,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
; GFX1232-NEXT: v_lshrrev_b32_e32 v0, s6, v0
; GFX1232-NEXT: .LBB12_2:
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1232-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_and_b32 s2, s2, 0xff
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1232-NEXT: v_or_b32_e32 v0, s2, v0
; GFX1232-NEXT: s_mov_b32 s2, -1
; GFX1232-NEXT: buffer_store_b8 v0, off, s[0:3], null
@@ -7551,8 +7551,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7LESS-NEXT: s_and_b32 s4, s4, 0xff
; GFX7LESS-NEXT: s_and_b32 s5, s10, 0xff
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4
; GFX7LESS-NEXT: v_mad_u32_u24 v0, s5, v4, v0
@@ -7608,8 +7608,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX8-NEXT: v_lshrrev_b32_e32 v0, s11, v2
; GFX8-NEXT: .LBB13_4: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_and_b32 s4, s4, 0xff
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s3, 0xf000
@@ -7666,8 +7666,8 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX9-NEXT: v_lshrrev_b32_e32 v0, s11, v2
; GFX9-NEXT: .LBB13_4: ; %Flow
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_and_b32 s4, s4, 0xff
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
@@ -7725,10 +7725,10 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s11, v2
; GFX1064-NEXT: .LBB13_4: ; %Flow
; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX1064-NEXT: v_and_b32_e32 v0, 0xff, v0
...
[truncated]
This is still a bit of a WIP. It could land as-is, but I think adding a few more opcodes wouldn't hurt. I also think making this recursive so it can scalarize chains of instructions wouldn't hurt either. However, before I keep going on this I'd like some feedback on the direction/usefulness of this, which is why I'm asking for a review now.