Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 40 additions & 18 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ class SIFoldOperandsImpl {
std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
bool tryConstantFoldOp(MachineInstr *MI) const;
bool tryFoldCndMask(MachineInstr &MI) const;
bool tryFoldZeroHighBits(MachineInstr &MI) const;
bool tryFoldArithmetic(MachineInstr &MI) const;
bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;

bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
Expand Down Expand Up @@ -1730,26 +1730,48 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
return true;
}

bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
MI.getOpcode() != AMDGPU::V_AND_B32_e32)
return false;
bool SIFoldOperandsImpl::tryFoldArithmetic(MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();

std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
return false;
auto replaceAndFold = [this](MachineOperand &NewOp, MachineOperand &OldOp,
MachineInstr &MI) -> bool {
if (!(NewOp.isReg() && OldOp.isReg()))
return false;
Register OldReg = OldOp.getReg();
MRI->replaceRegWith(NewOp.getReg(), OldReg);
if (!OldOp.isKill())
MRI->clearKillFlags(OldReg);
MI.eraseFromParent();
return true;
};

Register Src1 = MI.getOperand(2).getReg();
MachineInstr *SrcDef = MRI->getVRegDef(Src1);
if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
switch (Opc) {
default:
return false;
case AMDGPU::V_AND_B32_e64:
case AMDGPU::V_AND_B32_e32: {
std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
return false;

Register Dst = MI.getOperand(0).getReg();
MRI->replaceRegWith(Dst, Src1);
if (!MI.getOperand(2).isKill())
MRI->clearKillFlags(Src1);
MI.eraseFromParent();
return true;
MachineOperand &Src1Op = MI.getOperand(2);
MachineInstr *SrcDef = MRI->getVRegDef(Src1Op.getReg());
if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
return false;

return replaceAndFold(MI.getOperand(0), Src1Op, MI);
}
case AMDGPU::V_ADD_U32_e64:
case AMDGPU::V_ADD_U32_e32: {
std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
if (!Src0Imm || *Src0Imm != 0 || !MI.getOperand(2).isReg())
return false;

return replaceAndFold(MI.getOperand(0), MI.getOperand(2), MI);
}
}

return false;
}

bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
Expand Down Expand Up @@ -2790,7 +2812,7 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
for (auto &MI : make_early_inc_range(*MBB)) {
Changed |= tryFoldCndMask(MI);

if (tryFoldZeroHighBits(MI)) {
if (tryFoldArithmetic(MI)) {
Changed = true;
continue;
}
Expand Down
20 changes: 20 additions & 0 deletions llvm/test/CodeGen/AMDGPU/groupstaticsize-zero.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN %s

@global_smem = external addrspace(3) global [0 x i8]

; Kernel that stores a zeroinitializer <4 x bfloat> (align 8) into
; @global_smem at a byte offset equal to (workitem.id.x & 1).
; The autogenerated assertions below expect the result of the v_and to be used
; directly as the ds_write_b64 address, with no add instruction emitted between
; the mask and the store.
; NOTE(review): presumably the base address of @global_smem lowers to a zero
; constant on this target, which is what would leave an "add 0" to fold away;
; confirm against the lowering of the external zero-sized LDS global.
define amdgpu_kernel void @addzero() {
; GCN-LABEL: addzero:
; GCN: ; %bb.0: ; %.lr.ph
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mov_b32_e32 v3, v2
; GCN-NEXT: ds_write_b64 v0, v[2:3]
; GCN-NEXT: s_endpgm
.lr.ph:
%0 = tail call i32 @llvm.amdgcn.workitem.id.x()
; Keep only the low bit of the workitem id; it becomes the byte offset.
%1 = and i32 %0, 1
; Address = @global_smem + %1 bytes (i8-typed getelementptr).
%2 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %1
store <4 x bfloat> zeroinitializer, ptr addrspace(3) %2, align 8
ret void
}