Skip to content

Commit b970108

Browse files
AMDGPU/GlobalISel: Import D16 load patterns and add combines for them (#153178)
Add G_AMDGPU_LOAD_D16 generic instructions and GINodeEquivs for them; this imports the D16 load patterns into GlobalISel's TableGen-generated instruction selector. For the newly imported patterns to work, add combines for G_AMDGPU_LOAD_D16 in AMDGPURegBankCombiner.
1 parent 861dc29 commit b970108

File tree

9 files changed

+622
-196
lines changed

9 files changed

+622
-196
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCombine.td

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,12 @@ def int_minmax_to_med3 : GICombineRule<
7171
[{ return matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
7272
(apply [{ applyMed3(*${min_or_max}, ${matchinfo}); }])>;
7373

74+
// Combine rooted at a G_BITCAST. Matching and rewriting are both done inside
// combineD16Load (there is no separate apply step). The rule is only enabled
// when Subtarget->d16PreservesUnusedBits() holds.
let Predicates = [Predicate<"Subtarget->d16PreservesUnusedBits()">] in
75+
def d16_load : GICombineRule<
76+
(defs root:$bitcast),
77+
(combine (G_BITCAST $dst, $src):$bitcast,
78+
[{ return combineD16Load(*${bitcast} ); }])>;
79+
7480
def fp_minmax_to_med3 : GICombineRule<
7581
(defs root:$min_or_max, med3_matchdata:$matchinfo),
7682
(match (wip_match_opcode G_FMAXNUM,
@@ -219,5 +225,6 @@ def AMDGPURegBankCombiner : GICombiner<
219225
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
220226
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
221227
identity_combines, redundant_and, constant_fold_cast_op,
222-
cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> {
228+
cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
229+
d16_load]> {
223230
}

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,13 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
315315
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
316316
def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;
317317

318+
// Pair each G_AMDGPU_LOAD_D16_* generic opcode with its SIload_d16_* node so
// that the D16 load patterns are imported into the GlobalISel selector.
def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO, SIload_d16_lo>;
319+
def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_U8, SIload_d16_lo_u8>;
320+
def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_I8, SIload_d16_lo_i8>;
321+
def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI, SIload_d16_hi>;
322+
def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_U8, SIload_d16_hi_u8>;
323+
def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_I8, SIload_d16_hi_i8>;
324+
318325
def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
319326
// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
320327
// so we don't mark it as equivalent.

llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,10 @@ class AMDGPURegBankCombinerImpl : public Combiner {
8989

9090
void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;
9191

92+
bool combineD16Load(MachineInstr &MI) const;
93+
bool applyD16Load(unsigned D16Opc, MachineInstr &DstMI,
94+
MachineInstr *SmallLoad, Register ToOverwriteD16) const;
95+
9296
private:
9397
SIModeRegisterDefaults getMode() const;
9498
bool getIEEE() const;
@@ -392,6 +396,88 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
392396
MI.eraseFromParent();
393397
}
394398

399+
// Tries to replace MI (the root G_BITCAST of the d16_load combine rule) with a
// G_AMDGPU_LOAD_D16_* instruction.
//
// The source of MI (operand 1) must be an OR that merges a small load into one
// 16-bit half of a value obtained by bitcasting Dst, with the other half
// preserved by an AND mask:
//   lo:  (bitcast(Dst) & 0xFFFF0000) | Load
//   hi:  (bitcast(Dst) & 0x0000FFFF) | (Load << 16)
// Load is either a G_ZEXTLOAD with an 8- or 16-bit memory size, or
// (G_SEXTLOAD & 0xFFFF) with an 8-bit memory size.
//
// Returns true if MI was rewritten through applyD16Load, false if nothing
// matched (MI is left untouched in that case).
bool AMDGPURegBankCombinerImpl::combineD16Load(MachineInstr &MI) const {
400+
Register Dst;
401+
MachineInstr *Load, *SextLoad;
402+
// AND mask that clears the low 16 bits (keeps everything above them).
const int64_t CleanLo16 = 0xFFFFFFFFFFFF0000;
403+
// AND mask that keeps only the low 16 bits.
const int64_t CleanHi16 = 0x000000000000FFFF;
404+
405+
// Load lo: the loaded value replaces the low 16 bits of bitcast(Dst).
406+
if (mi_match(MI.getOperand(1).getReg(), MRI,
407+
m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)),
408+
m_Copy(m_SpecificICst(CleanLo16))),
409+
m_MInstr(Load)))) {
410+
411+
// Zero-extending load: pick the opcode from the memory access size.
if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
412+
// Only the first memory operand is consulted.
const MachineMemOperand *MMO = *Load->memoperands_begin();
413+
unsigned LoadSize = MMO->getSizeInBits().getValue();
414+
if (LoadSize == 8)
415+
return applyD16Load(AMDGPU::G_AMDGPU_LOAD_D16_LO_U8, MI, Load, Dst);
416+
if (LoadSize == 16)
417+
return applyD16Load(AMDGPU::G_AMDGPU_LOAD_D16_LO, MI, Load, Dst);
418+
return false;
419+
}
420+
421+
// Sign-extending byte load whose result was masked down to 16 bits.
if (mi_match(
422+
Load, MRI,
423+
m_GAnd(m_MInstr(SextLoad), m_Copy(m_SpecificICst(CleanHi16))))) {
424+
if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
425+
return false;
426+
427+
const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
428+
// Only 8-bit sign-extending loads are handled here.
if (MMO->getSizeInBits().getValue() != 8)
429+
return false;
430+
431+
return applyD16Load(AMDGPU::G_AMDGPU_LOAD_D16_LO_I8, MI, SextLoad, Dst);
432+
}
433+
434+
return false;
435+
}
436+
437+
// Load hi: the loaded value, shifted left by 16, replaces the high 16 bits
// of bitcast(Dst).
438+
if (mi_match(MI.getOperand(1).getReg(), MRI,
439+
m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)),
440+
m_Copy(m_SpecificICst(CleanHi16))),
441+
m_GShl(m_MInstr(Load), m_Copy(m_SpecificICst(16)))))) {
442+
443+
// Zero-extending load: pick the opcode from the memory access size.
if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
444+
const MachineMemOperand *MMO = *Load->memoperands_begin();
445+
unsigned LoadSize = MMO->getSizeInBits().getValue();
446+
if (LoadSize == 8)
447+
return applyD16Load(AMDGPU::G_AMDGPU_LOAD_D16_HI_U8, MI, Load, Dst);
448+
if (LoadSize == 16)
449+
return applyD16Load(AMDGPU::G_AMDGPU_LOAD_D16_HI, MI, Load, Dst);
450+
return false;
451+
}
452+
453+
// Sign-extending byte load whose result was masked down to 16 bits.
if (mi_match(
454+
Load, MRI,
455+
m_GAnd(m_MInstr(SextLoad), m_Copy(m_SpecificICst(CleanHi16))))) {
456+
if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
457+
return false;
458+
const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
459+
// Only 8-bit sign-extending loads are handled here.
if (MMO->getSizeInBits().getValue() != 8)
460+
return false;
461+
462+
return applyD16Load(AMDGPU::G_AMDGPU_LOAD_D16_HI_I8, MI, SextLoad, Dst);
463+
}
464+
465+
return false;
466+
}
467+
468+
return false;
469+
}
470+
471+
// Emits the D16 load that replaces DstMI.
//
// D16Opc                 - opcode of the instruction to build.
// DstMI                  - instruction being replaced; its operand 0 becomes
//                          the result of the new instruction, and DstMI is
//                          erased afterwards.
// SmallLoad              - original load; its operand 1 (presumably the
//                          address - confirm) and its memory operands are
//                          reused for the new instruction.
// SrcReg32ToOverwriteD16 - passed as the second source of the new instruction.
//
// Always returns true.
//
// NOTE(review): SmallLoad itself is not erased here, and nothing in this
// function checks whether it has other users. The updated GFX9 checks in this
// commit show both the original load and the d16 load being emitted - confirm
// that this is intended.
bool AMDGPURegBankCombinerImpl::applyD16Load(
472+
unsigned D16Opc, MachineInstr &DstMI, MachineInstr *SmallLoad,
473+
Register SrcReg32ToOverwriteD16) const {
474+
B.buildInstr(D16Opc, {DstMI.getOperand(0).getReg()},
475+
{SmallLoad->getOperand(1).getReg(), SrcReg32ToOverwriteD16})
476+
.setMemRefs(SmallLoad->memoperands());
477+
DstMI.eraseFromParent();
478+
return true;
479+
}
480+
395481
SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
396482
return MF.getInfo<SIMachineFunctionInfo>()->getMode();
397483
}

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4402,6 +4402,21 @@ def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
44024402
def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
44034403
def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
44044404

4405+
// Common shape of the G_AMDGPU_LOAD_D16_* generic instructions: one result,
// an address input, may load, no other side effects.
//
// NOTE(review): only $addr is declared as an input, while
// AMDGPURegBankCombinerImpl::applyD16Load builds these instructions with two
// source registers (the address and the register whose other half is kept).
// Confirm whether the second input should be declared here as well.
class D16LoadGenericInstruction : AMDGPUGenericInstruction {
4406+
let OutOperandList = (outs type0:$dst);
4407+
let InOperandList = (ins ptype1:$addr);
4408+
let hasSideEffects = 0;
4409+
let mayLoad = 1;
4410+
}
4411+
4412+
// LO/HI select which 16-bit half the combiner targets; the _U8/_I8 variants
// are produced from 8-bit zero-/sign-extending loads, the unsuffixed ones from
// 16-bit zero-extending loads.
def G_AMDGPU_LOAD_D16_LO : D16LoadGenericInstruction;
4413+
def G_AMDGPU_LOAD_D16_LO_U8 : D16LoadGenericInstruction;
4414+
def G_AMDGPU_LOAD_D16_LO_I8 : D16LoadGenericInstruction;
4415+
def G_AMDGPU_LOAD_D16_HI : D16LoadGenericInstruction;
4416+
def G_AMDGPU_LOAD_D16_HI_U8 : D16LoadGenericInstruction;
4417+
def G_AMDGPU_LOAD_D16_HI_I8 : D16LoadGenericInstruction;
4418+
4419+
44054420
class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
44064421
let OutOperandList = (outs);
44074422
let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -186,11 +186,11 @@ define <2 x i16> @atomic_load_flat_monotonic_i16_d16_hi_vector_insert(ptr %ptr,
186186
; GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_hi_vector_insert:
187187
; GFX9: ; %bb.0:
188188
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189-
; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc
190-
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
189+
; GFX9-NEXT: flat_load_ushort v3, v[0:1] glc
190+
; GFX9-NEXT: s_nop 0
191+
; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1] glc
191192
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
192-
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
193-
; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
193+
; GFX9-NEXT: v_mov_b32_e32 v0, v2
194194
; GFX9-NEXT: s_setpc_b64 s[30:31]
195195
%load = load atomic i16, ptr %ptr monotonic, align 2
196196
%insert = insertelement <2 x i16> %vec, i16 %load, i32 1
@@ -260,10 +260,11 @@ define <2 x i16> @atomic_load_flat_monotonic_i16_d16_lo_vector_insert(ptr %ptr,
260260
; GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_lo_vector_insert:
261261
; GFX9: ; %bb.0:
262262
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
263-
; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc
264-
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff0000
263+
; GFX9-NEXT: flat_load_ushort v3, v[0:1] glc
264+
; GFX9-NEXT: s_nop 0
265+
; GFX9-NEXT: flat_load_short_d16 v2, v[0:1] glc
265266
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
266-
; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
267+
; GFX9-NEXT: v_mov_b32_e32 v0, v2
267268
; GFX9-NEXT: s_setpc_b64 s[30:31]
268269
%load = load atomic i16, ptr %ptr monotonic, align 2
269270
%insert = insertelement <2 x i16> %vec, i16 %load, i32 0

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -519,11 +519,11 @@ define <2 x i16> @atomic_load_global_monotonic_i16_d16_hi_vector_insert(ptr addr
519519
; GFX9-LABEL: atomic_load_global_monotonic_i16_d16_hi_vector_insert:
520520
; GFX9: ; %bb.0:
521521
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
522-
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
523-
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
522+
; GFX9-NEXT: global_load_ushort v3, v[0:1], off glc
523+
; GFX9-NEXT: s_nop 0
524+
; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off glc
524525
; GFX9-NEXT: s_waitcnt vmcnt(0)
525-
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
526-
; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
526+
; GFX9-NEXT: v_mov_b32_e32 v0, v2
527527
; GFX9-NEXT: s_setpc_b64 s[30:31]
528528
%load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2
529529
%insert = insertelement <2 x i16> %vec, i16 %load, i32 1
@@ -622,10 +622,11 @@ define <2 x i16> @atomic_load_global_monotonic_i16_d16_lo_vector_insert(ptr addr
622622
; GFX9-LABEL: atomic_load_global_monotonic_i16_d16_lo_vector_insert:
623623
; GFX9: ; %bb.0:
624624
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
625-
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
626-
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff0000
625+
; GFX9-NEXT: global_load_ushort v3, v[0:1], off glc
626+
; GFX9-NEXT: s_nop 0
627+
; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off glc
627628
; GFX9-NEXT: s_waitcnt vmcnt(0)
628-
; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
629+
; GFX9-NEXT: v_mov_b32_e32 v0, v2
629630
; GFX9-NEXT: s_setpc_b64 s[30:31]
630631
%load = load atomic i16, ptr addrspace(1) %ptr monotonic, align 2
631632
%insert = insertelement <2 x i16> %vec, i16 %load, i32 0

llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -400,11 +400,10 @@ define <2 x i16> @atomic_load_local_monotonic_i16_d16_hi_vector_insert(ptr addrs
400400
; GFX9-LABEL: atomic_load_local_monotonic_i16_d16_hi_vector_insert:
401401
; GFX9: ; %bb.0:
402402
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
403-
; GFX9-NEXT: ds_read_u16 v0, v0
404-
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
403+
; GFX9-NEXT: ds_read_u16 v2, v0
404+
; GFX9-NEXT: ds_read_u16_d16_hi v1, v0
405405
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
406-
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
407-
; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
406+
; GFX9-NEXT: v_mov_b32_e32 v0, v1
408407
; GFX9-NEXT: s_setpc_b64 s[30:31]
409408
%load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
410409
%insert = insertelement <2 x i16> %vec, i16 %load, i32 1
@@ -478,10 +477,10 @@ define <2 x i16> @atomic_load_local_monotonic_i16_d16_lo_vector_insert(ptr addrs
478477
; GFX9-LABEL: atomic_load_local_monotonic_i16_d16_lo_vector_insert:
479478
; GFX9: ; %bb.0:
480479
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
481-
; GFX9-NEXT: ds_read_u16 v0, v0
482-
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff0000
480+
; GFX9-NEXT: ds_read_u16 v2, v0
481+
; GFX9-NEXT: ds_read_u16_d16 v1, v0
483482
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
484-
; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
483+
; GFX9-NEXT: v_mov_b32_e32 v0, v1
485484
; GFX9-NEXT: s_setpc_b64 s[30:31]
486485
%load = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
487486
%insert = insertelement <2 x i16> %vec, i16 %load, i32 0

0 commit comments

Comments
 (0)