Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUCombine.td
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,12 @@ def int_minmax_to_med3 : GICombineRule<
[{ return matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
(apply [{ applyMed3(*${min_or_max}, ${matchinfo}); }])>;

// Fold "insert a loaded 16-bit value into one half of a 32-bit register"
// patterns (rooted at the G_BITCAST of the vector result) into the
// G_AMDGPU_LOAD_D16_* pseudos. Only enabled on subtargets where D16 loads
// preserve the unused half of the destination register.
let Predicates = [Predicate<"Subtarget->d16PreservesUnusedBits()">] in
def d16_load : GICombineRule<
(defs root:$bitcast),
(combine (G_BITCAST $dst, $src):$bitcast,
[{ return combineD16Load(*${bitcast} ); }])>;

def fp_minmax_to_med3 : GICombineRule<
(defs root:$min_or_max, med3_matchdata:$matchinfo),
(match (wip_match_opcode G_FMAXNUM,
Expand Down Expand Up @@ -219,5 +225,6 @@ def AMDGPURegBankCombiner : GICombiner<
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
identity_combines, redundant_and, constant_fold_cast_op,
cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> {
cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
d16_load]> {
}
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,13 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;

// Map the generic D16 load pseudos to the corresponding SelectionDAG D16
// load nodes so the imported SDag selection patterns also match the
// GlobalISel opcodes produced by the regbank combiner.
def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO, SIload_d16_lo>;
def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_U8, SIload_d16_lo_u8>;
def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_I8, SIload_d16_lo_i8>;
def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI, SIload_d16_hi>;
def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_U8, SIload_d16_hi_u8>;
def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_I8, SIload_d16_hi_i8>;

def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
// so we don't mark it as equivalent.
Expand Down
86 changes: 86 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ class AMDGPURegBankCombinerImpl : public Combiner {

void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;

bool combineD16Load(MachineInstr &MI) const;
bool applyD16Load(unsigned D16Opc, MachineInstr &DstMI,
MachineInstr *SmallLoad, Register ToOverwriteD16) const;

private:
SIModeRegisterDefaults getMode() const;
bool getIEEE() const;
Expand Down Expand Up @@ -392,6 +396,88 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
MI.eraseFromParent();
}

/// Match a G_BITCAST whose 32-bit source merges a narrow load into one half
/// of an existing 32-bit value, and rewrite it to a single D16 load pseudo:
///
///   lo: (or (and (bitcast Dst), 0xFFFF0000), Load)
///   hi: (or (and (bitcast Dst), 0x0000FFFF), (shl Load, 16))
///
/// where Load is a 16-bit G_ZEXTLOAD, an 8-bit G_ZEXTLOAD, or an 8-bit
/// G_SEXTLOAD masked back down to 16 bits. Returns true if a rewrite was
/// performed.
bool AMDGPURegBankCombinerImpl::combineD16Load(MachineInstr &MI) const {
  Register Dst;
  MachineInstr *Load;
  // G_CONSTANTs here are 32-bit, so the "clear low half" mask 0xFFFF0000
  // must be written sign-extended to 64 bits for m_SpecificICst to match.
  const int64_t CleanLo16 = 0xFFFFFFFFFFFF0000;
  const int64_t CleanHi16 = 0x000000000000FFFF;

  // Shared dispatch for the lo and hi forms: given the matched instruction
  // feeding the 16 bits to be overwritten, select the D16 opcode from the
  // load kind. Only the three opcodes differ between the two forms.
  // NOTE(review): neither the small load nor the and/or/shl intermediates
  // are erased or checked for single use; they are left for later DCE.
  // Confirm re-issuing the load via the new pseudo is acceptable when the
  // original load has other users (see the atomic-load tests, which now
  // emit two loads).
  auto tryCombineLoad = [&](unsigned Opc16, unsigned OpcU8,
                            unsigned OpcI8) -> bool {
    if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
      const MachineMemOperand *MMO = *Load->memoperands_begin();
      unsigned LoadSize = MMO->getSizeInBits().getValue();
      if (LoadSize == 8)
        return applyD16Load(OpcU8, MI, Load, Dst);
      if (LoadSize == 16)
        return applyD16Load(Opc16, MI, Load, Dst);
      return false;
    }

    // Signed byte load: an 8-bit G_SEXTLOAD whose result is masked to the
    // low 16 bits before being merged in.
    MachineInstr *SextLoad;
    if (mi_match(
            Load, MRI,
            m_GAnd(m_MInstr(SextLoad), m_Copy(m_SpecificICst(CleanHi16))))) {
      if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
        return false;

      const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
      if (MMO->getSizeInBits().getValue() != 8)
        return false;

      return applyD16Load(OpcI8, MI, SextLoad, Dst);
    }

    return false;
  };

  // Load lo: keep Dst's high half, replace the low half with the load.
  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)),
                            m_Copy(m_SpecificICst(CleanLo16))),
                     m_MInstr(Load))))
    return tryCombineLoad(AMDGPU::G_AMDGPU_LOAD_D16_LO,
                          AMDGPU::G_AMDGPU_LOAD_D16_LO_U8,
                          AMDGPU::G_AMDGPU_LOAD_D16_LO_I8);

  // Load hi: keep Dst's low half, replace the high half with the load
  // (shifted into position by a constant 16).
  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)),
                            m_Copy(m_SpecificICst(CleanHi16))),
                     m_GShl(m_MInstr(Load), m_Copy(m_SpecificICst(16))))))
    return tryCombineLoad(AMDGPU::G_AMDGPU_LOAD_D16_HI,
                          AMDGPU::G_AMDGPU_LOAD_D16_HI_U8,
                          AMDGPU::G_AMDGPU_LOAD_D16_HI_I8);

  return false;
}

bool AMDGPURegBankCombinerImpl::applyD16Load(
unsigned D16Opc, MachineInstr &DstMI, MachineInstr *SmallLoad,
Register SrcReg32ToOverwriteD16) const {
B.buildInstr(D16Opc, {DstMI.getOperand(0).getReg()},
{SmallLoad->getOperand(1).getReg(), SrcReg32ToOverwriteD16})
.setMemRefs(SmallLoad->memoperands());
DstMI.eraseFromParent();
return true;
}

// Fetch the FP mode defaults recorded on this function's machine-function
// info.
SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return MFI->getMode();
}
Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -4402,6 +4402,21 @@ def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;

// Generic pseudo for a D16 load: loads a narrow value (8- or 16-bit,
// zero- or sign-extended per opcode) into one half of a 32-bit result.
// mayLoad with no other side effects lets these be scheduled/DCE'd like
// ordinary loads.
// NOTE(review): AMDGPURegBankCombiner::applyD16Load builds these with a
// second source operand (the 32-bit pass-through value) that is not
// declared in InOperandList here — confirm this is intentional.
class D16LoadGenericInstruction : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins ptype1:$addr);
let hasSideEffects = 0;
let mayLoad = 1;
}

// lo/hi select which half of $dst is written; U8/I8 are zero-/sign-extended
// byte loads, the unsuffixed forms are 16-bit loads.
def G_AMDGPU_LOAD_D16_LO : D16LoadGenericInstruction;
def G_AMDGPU_LOAD_D16_LO_U8 : D16LoadGenericInstruction;
def G_AMDGPU_LOAD_D16_LO_I8 : D16LoadGenericInstruction;
def G_AMDGPU_LOAD_D16_HI : D16LoadGenericInstruction;
def G_AMDGPU_LOAD_D16_HI_U8 : D16LoadGenericInstruction;
def G_AMDGPU_LOAD_D16_HI_I8 : D16LoadGenericInstruction;


class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
let OutOperandList = (outs);
let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
Expand Down
15 changes: 8 additions & 7 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,11 @@ define <2 x i16> @atomic_load_flat_monotonic_i16_d16_hi_vector_insert(ptr %ptr,
; GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_hi_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: flat_load_ushort v3, v[0:1] glc
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 1
Expand Down Expand Up @@ -260,10 +260,11 @@ define <2 x i16> @atomic_load_flat_monotonic_i16_d16_lo_vector_insert(ptr %ptr,
; GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_lo_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff0000
; GFX9-NEXT: flat_load_ushort v3, v[0:1] glc
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: flat_load_short_d16 v2, v[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 0
Expand Down
15 changes: 8 additions & 7 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
Original file line number Diff line number Diff line change
Expand Up @@ -519,11 +519,11 @@ define <2 x i16> @atomic_load_global_monotonic_i16_d16_hi_vector_insert(ptr addr
; GFX9-LABEL: atomic_load_global_monotonic_i16_d16_hi_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: global_load_ushort v3, v[0:1], off glc
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 1
Expand Down Expand Up @@ -622,10 +622,11 @@ define <2 x i16> @atomic_load_global_monotonic_i16_d16_lo_vector_insert(ptr addr
; GFX9-LABEL: atomic_load_global_monotonic_i16_d16_lo_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff0000
; GFX9-NEXT: global_load_ushort v3, v[0:1], off glc
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 0
Expand Down
13 changes: 6 additions & 7 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -400,11 +400,10 @@ define <2 x i16> @atomic_load_local_monotonic_i16_d16_hi_vector_insert(ptr addrs
; GFX9-LABEL: atomic_load_local_monotonic_i16_d16_hi_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_u16 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: ds_read_u16 v2, v0
; GFX9-NEXT: ds_read_u16_d16_hi v1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 1
Expand Down Expand Up @@ -478,10 +477,10 @@ define <2 x i16> @atomic_load_local_monotonic_i16_d16_lo_vector_insert(ptr addrs
; GFX9-LABEL: atomic_load_local_monotonic_i16_d16_lo_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_u16 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX9-NEXT: ds_read_u16 v2, v0
; GFX9-NEXT: ds_read_u16_d16 v1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 0
Expand Down
Loading