Skip to content

Conversation

@petar-avramovic
Copy link
Collaborator

Add G_AMDGPU_LOAD_D16 generic instructions and GINodeEquivs for them,
this will import D16 load patterns to global-isel's tablegened
instruction selector.
For the newly imported patterns to work, add combines for G_AMDGPU_LOAD_D16
in AMDGPURegBankCombiner.

Copy link
Collaborator Author

petar-avramovic commented Aug 12, 2025

@llvmbot
Copy link
Member

llvmbot commented Aug 12, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Petar Avramovic (petar-avramovic)

Changes

Add G_AMDGPU_LOAD_D16 generic instructions and GINodeEquivs for them,
this will import D16 load patterns to global-isel's tablegened
instruction selector.
For the newly imported patterns to work, add combines for G_AMDGPU_LOAD_D16
in AMDGPURegBankCombiner.


Patch is 39.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153178.diff

6 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUCombine.td (+10-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUGISel.td (+7)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp (+105)
  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+15)
  • (added) llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll (+412)
  • (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+72-174)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 9587fad1ecd63..30e5362ba7adf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -71,6 +71,14 @@ def int_minmax_to_med3 : GICombineRule<
          [{ return matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
   (apply [{ applyMed3(*${min_or_max}, ${matchinfo}); }])>;
 
+def d16_matchdata : GIDefMatchData<"D16MatchInfo">;
+
+def d16_load : GICombineRule<
+  (defs root:$bitcast, d16_matchdata:$matchinfo),
+  (match (wip_match_opcode G_BITCAST):$bitcast,
+         [{ return matchD16Load(*${bitcast}, ${matchinfo}); }]),
+  (apply [{ applyD16Load(*${bitcast}, ${matchinfo}); }])>;
+
 def fp_minmax_to_med3 : GICombineRule<
   (defs root:$min_or_max, med3_matchdata:$matchinfo),
   (match (wip_match_opcode G_FMAXNUM,
@@ -198,5 +206,6 @@ def AMDGPURegBankCombiner : GICombiner<
    zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
    fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
    identity_combines, redundant_and, constant_fold_cast_op,
-   cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> {
+   cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines,
+   d16_load]> {
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 394a143dd3086..a4ccf368f7745 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -309,6 +309,13 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
 def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
 def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;
 
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO, SIload_d16_lo>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_U8, SIload_d16_lo_u8>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_LO_I8, SIload_d16_lo_i8>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI, SIload_d16_hi>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_U8, SIload_d16_hi_u8>;
+def : GINodeEquiv<G_AMDGPU_LOAD_D16_HI_I8, SIload_d16_hi_i8>;
+
 def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
 // G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
 // so we don't mark it as equivalent.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index ee324a5e93f0f..946a3361aed29 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -74,6 +74,12 @@ class AMDGPURegBankCombinerImpl : public Combiner {
     Register Val0, Val1, Val2;
   };
 
+  struct D16MatchInfo {
+    unsigned Opc;
+    Register Dst;
+    MachineInstr *Load;
+  };
+
   MinMaxMedOpc getMinMaxPair(unsigned Opc) const;
 
   template <class m_Cst, typename CstTy>
@@ -89,6 +95,9 @@ class AMDGPURegBankCombinerImpl : public Combiner {
 
   void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;
 
+  bool matchD16Load(MachineInstr &MI, D16MatchInfo &MatchInfo) const;
+  void applyD16Load(MachineInstr &MI, D16MatchInfo &MatchInfo) const;
+
 private:
   SIModeRegisterDefaults getMode() const;
   bool getIEEE() const;
@@ -392,6 +401,102 @@ void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
   MI.eraseFromParent();
 }
 
+bool AMDGPURegBankCombinerImpl::matchD16Load(MachineInstr &MI,
+                                             D16MatchInfo &MatchInfo) const {
+  if (!STI.d16PreservesUnusedBits())
+    return false;
+
+  Register Dst;
+  MachineInstr *Load, *SextLoad;
+  const int64_t CleanLo16 = 0xFFFFFFFFFFFF0000;
+  const int64_t CleanHi16 = 0x000000000000FFFF;
+
+  // Load lo
+  if (mi_match(MI.getOperand(1).getReg(), MRI,
+               m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)),
+                            m_Copy(m_SpecificICst(CleanLo16))),
+                     m_MInstr(Load)))) {
+
+    if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
+      const MachineMemOperand *MMO = *Load->memoperands_begin();
+      if (MMO->isAtomic())
+        return false;
+
+      unsigned LoadSize = MMO->getSizeInBits().getValue();
+      if (LoadSize == 8) {
+        MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_LO_U8, Dst, Load};
+      } else if (LoadSize == 16) {
+        MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_LO, Dst, Load};
+      } else
+        return false;
+      return true;
+    }
+
+    if (mi_match(
+            Load, MRI,
+            m_GAnd(m_MInstr(SextLoad), m_Copy(m_SpecificICst(CleanHi16))))) {
+      if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
+        return false;
+
+      const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
+      if (MMO->isAtomic() || MMO->getSizeInBits().getValue() != 8)
+        return false;
+
+      MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_LO_I8, Dst, SextLoad};
+      return true;
+    }
+
+    return false;
+  }
+
+  // Load hi
+  if (mi_match(MI.getOperand(1).getReg(), MRI,
+               m_GOr(m_GAnd(m_GBitcast(m_Reg(Dst)),
+                            m_Copy(m_SpecificICst(CleanHi16))),
+                     m_GShl(m_MInstr(Load), m_Copy(m_SpecificICst(16)))))) {
+
+    if (Load->getOpcode() == AMDGPU::G_ZEXTLOAD) {
+      const MachineMemOperand *MMO = *Load->memoperands_begin();
+      if (MMO->isAtomic())
+        return false;
+
+      unsigned LoadSize = MMO->getSizeInBits().getValue();
+      if (LoadSize == 8) {
+        MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_HI_U8, Dst, Load};
+      } else if (LoadSize == 16) {
+        MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_HI, Dst, Load};
+      } else
+        return false;
+      return true;
+    }
+
+    if (mi_match(
+            Load, MRI,
+            m_GAnd(m_MInstr(SextLoad), m_Copy(m_SpecificICst(CleanHi16))))) {
+      if (SextLoad->getOpcode() != AMDGPU::G_SEXTLOAD)
+        return false;
+      const MachineMemOperand *MMO = *SextLoad->memoperands_begin();
+      if (MMO->isAtomic() || MMO->getSizeInBits().getValue() != 8)
+        return false;
+
+      MatchInfo = {AMDGPU::G_AMDGPU_LOAD_D16_HI_I8, Dst, SextLoad};
+      return true;
+    }
+
+    return false;
+  }
+
+  return false;
+}
+
+void AMDGPURegBankCombinerImpl::applyD16Load(MachineInstr &MI,
+                                             D16MatchInfo &MatchInfo) const {
+  B.buildInstr(MatchInfo.Opc, {MI.getOperand(0).getReg()},
+               {MatchInfo.Load->getOperand(1).getReg(), MatchInfo.Dst})
+      .addMemOperand(*MatchInfo.Load->memoperands_begin());
+  MI.eraseFromParent();
+}
+
 SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
   return MF.getInfo<SIMachineFunctionInfo>()->getMode();
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bd5dfa92a8e43..ec761207351e1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -4206,6 +4206,21 @@ def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
 def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
 def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
 
+class D16LoadGenericInstruction : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins ptype1:$addr);
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+}
+
+def G_AMDGPU_LOAD_D16_LO : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_LO_U8 : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_LO_I8 : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_HI : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_HI_U8 : D16LoadGenericInstruction;
+def G_AMDGPU_LOAD_D16_HI_I8 : D16LoadGenericInstruction;
+
+
 class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
   let OutOperandList = (outs);
   let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll
new file mode 100644
index 0000000000000..62459247cc440
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll
@@ -0,0 +1,412 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200  -global-isel -new-reg-bank-select < %s | FileCheck --check-prefixes=GFX12 %s
+
+define amdgpu_ps void @load_P0_B16_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: load_P0_B16_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_d16_b16 v0, v[1:2]
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(0) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+  store <2 x i16> %res, ptr addrspace(0) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P0_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: load_P0_B16_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_d16_hi_b16 v0, v[1:2]
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(0) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+  store <2 x i16> %res, ptr addrspace(0) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: sextload_P0_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_d16_i8 v0, v[1:2]
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(0) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(0) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: sextload_P0_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_d16_hi_i8 v0, v[1:2]
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(0) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(0) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: zextload_P0_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_d16_u8 v0, v[1:2]
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(0) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(0) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
+; GFX12-LABEL: zextload_P0_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_d16_hi_u8 v0, v[1:2]
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[3:4], v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(0) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(0) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P1_B16_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P1_B16_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_b16 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(1) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P1_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P1_B16_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(1) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P1_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_i8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(1) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P1_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P1_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_hi_i8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(1) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P1_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_u8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(1) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P1_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P1_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_hi_u8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(1) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P3_B16_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: load_P3_B16_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    ds_load_u16_d16 v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    ds_store_b32 v2, v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(3) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+  store <2 x i16> %res, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P3_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: load_P3_B16_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    ds_load_u16_d16_hi v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    ds_store_b32 v2, v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(3) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+  store <2 x i16> %res, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: sextload_P3_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    ds_load_i8_d16 v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    ds_store_b32 v2, v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(3) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: sextload_P3_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    ds_load_i8_d16_hi v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    ds_store_b32 v2, v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(3) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: zextload_P3_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    ds_load_u8_d16 v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    ds_store_b32 v2, v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(3) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
+; GFX12-LABEL: zextload_P3_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    ds_load_u8_d16_hi v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    ds_store_b32 v2, v0
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(3) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(3) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P4_B16_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P4_B16_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_b16 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(4) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 0
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @load_P4_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: load_P4_B16_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i16, ptr addrspace(4) %ptra
+  %res = insertelement <2 x i16> %vec, i16 %a, i32 1
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P4_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_i8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(4) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @sextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: sextload_P4_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_hi_i8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(4) %ptra
+  %a16 = sext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 1
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P4_i8_D16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_u8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(4) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a16, i32 0
+  store <2 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @zextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
+; GFX12-LABEL: zextload_P4_i8_D16_Hi:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_d16_hi_u8 v0, v[1:2], off
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a = load i8, ptr addrspace(4) %ptra
+  %a16 = zext i8 %a to i16
+  %res = insertelement <2 x i16> %vec, i16 %a...
[truncated]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We really need a way to attach predicates to the pattern in tablegen

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do have a way, it's the field called Predicates in the combine rule.

Comment on lines 422 to 423
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think atomic is an issue for d16; this doesn't change the memory properties, only the destination register. If it is a problem, it's also a problem for volatile. So either this check is unnecessarily conservative, or it is missing a check for volatile too.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what I was missing; removing the isAtomic check made changes to a few files, and the results look correct.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

copyMemRefs. Also, can this be done inline in tablegen?

@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from 2a2bdbf to 696b1b2 Compare August 12, 2025 13:05
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch from b7c4e4a to db26062 Compare August 12, 2025 13:06
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from 696b1b2 to 788368e Compare August 12, 2025 13:16
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch from db26062 to 033a3b7 Compare August 12, 2025 13:16
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from 788368e to 1464e56 Compare August 12, 2025 13:28
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch 2 times, most recently from a7fa75b to d3106eb Compare August 13, 2025 12:46
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch 2 times, most recently from d1da8f1 to d8c91e6 Compare August 13, 2025 13:16
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch from d3106eb to b0b1c30 Compare August 13, 2025 13:16
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from d8c91e6 to e863c35 Compare August 13, 2025 13:51
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch from b0b1c30 to 34bde4a Compare August 13, 2025 13:52
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from e863c35 to 87b9b39 Compare August 22, 2025 14:49
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch 2 times, most recently from 441b892 to 617ca54 Compare August 22, 2025 14:56
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from 87b9b39 to ccc9546 Compare August 22, 2025 14:57
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do have a way, it's the field called Predicates in the combine rule.

@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch from 617ca54 to acde25e Compare August 26, 2025 12:17
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from ccc9546 to 0a46b28 Compare August 26, 2025 12:17
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from 0a46b28 to db74edc Compare September 4, 2025 09:45
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch from acde25e to 59fdffe Compare September 4, 2025 09:45
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/load-rules branch from db74edc to c9c95dd Compare September 10, 2025 16:21
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch from 59fdffe to 739caaa Compare September 10, 2025 16:21
Base automatically changed from users/petar-avramovic/load-rules to main September 11, 2025 09:26
Add G_AMDGPU_LOAD_D16 generic instructions and GINodeEquivs for them,
this will import D16 load patterns to global-isel's tablegened
instruction selector.
For the newly imported patterns to work, add combines for G_AMDGPU_LOAD_D16
in AMDGPURegBankCombiner.
@petar-avramovic petar-avramovic force-pushed the users/petar-avramovic/d16-loads branch from 739caaa to 2fbf8c8 Compare September 11, 2025 09:29
Copy link
Collaborator Author

petar-avramovic commented Sep 11, 2025

Merge activity

  • Sep 11, 10:00 AM UTC: A user started a stack merge that includes this pull request via Graphite.
  • Sep 11, 10:02 AM UTC: @petar-avramovic merged this pull request with Graphite.

@petar-avramovic petar-avramovic merged commit b970108 into main Sep 11, 2025
9 checks passed
@petar-avramovic petar-avramovic deleted the users/petar-avramovic/d16-loads branch September 11, 2025 10:02
@llvm-ci
Copy link
Collaborator

llvm-ci commented Sep 11, 2025

LLVM Buildbot has detected a new failure on builder llvm-clang-x86_64-expensive-checks-ubuntu running on as-builder-4 while building llvm at step 7 "test-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/187/builds/11235

Here is the relevant piece of the build log for the reference
Step 7 (test-check-all) failure: Test just built components: check-all completed (failure)
******************** TEST 'LLVM :: CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 2
/home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll | /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/FileCheck -check-prefixes=GCN,GFX6 /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
# executed command: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti
# executed command: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/FileCheck -check-prefixes=GCN,GFX6 /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
# RUN: at line 3
/home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll | /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/FileCheck -check-prefixes=GCN,GFX7 /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
# executed command: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri
# executed command: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/FileCheck -check-prefixes=GCN,GFX7 /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
# RUN: at line 4
/home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll | /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/FileCheck -check-prefixes=GCN,GFX8 /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
# executed command: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=fiji
# executed command: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/FileCheck -check-prefixes=GCN,GFX8 /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
# RUN: at line 5
/home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll | /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/FileCheck -check-prefixes=GCN,GFX9 /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll
# executed command: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900
# .---command stderr------------
# | 
# | # After AMDGPURegBankCombiner
# | # Machine code for function atomic_load_global_monotonic_i16_d16_hi_vector_insert: IsSSA, TracksLiveness, Legalized, RegBankSelected
# | Function Live Ins: $sgpr4_sgpr5 in %2, $sgpr6_sgpr7 in %3, $sgpr8_sgpr9 in %4, $sgpr10_sgpr11 in %5, $sgpr12 in %6, $sgpr13 in %7, $sgpr14 in %8, $sgpr15 in %9
# | 
# | bb.1 (%ir-block.0):
# |   liveins: $vgpr0, $vgpr1, $vgpr2
# |   %10:vgpr(s32) = COPY $vgpr0
# |   %11:vgpr(s32) = COPY $vgpr1
# |   %0:vgpr(p1) = G_MERGE_VALUES %10:vgpr(s32), %11:vgpr(s32)
# |   %1:vgpr(<2 x s16>) = COPY $vgpr2
# |   %19:vgpr(s32) = G_ZEXTLOAD %0:vgpr(p1) :: (load monotonic (s16) from %ir.ptr, addrspace 1)
# |   %13:vgpr(<2 x s16>) = G_AMDGPU_LOAD_D16_HI %0:vgpr(p1), %1:vgpr(<2 x s16>) :: (load monotonic (s16) from %ir.ptr, addrspace 1)
# |   $vgpr0 = COPY %13:vgpr(<2 x s16>)
# |   SI_RETURN implicit $vgpr0
# | 
# | # End machine code for function atomic_load_global_monotonic_i16_d16_hi_vector_insert.
# | 
# | *** Bad machine code: Extra explicit operand on non-variadic instruction ***
# | - function:    atomic_load_global_monotonic_i16_d16_hi_vector_insert
# | - basic block: %bb.1  (0x591483d12918)
# | - instruction: %13:vgpr(<2 x s16>) = G_AMDGPU_LOAD_D16_HI %0:vgpr(p1), %1:vgpr(<2 x s16>) :: (load monotonic (s16) from %ir.ptr, addrspace 1)
# | - operand 2:   %1:vgpr
# | LLVM ERROR: Found 1 machine code errors.
# | PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace and instructions to reproduce the bug.
# | Stack dump:
# | 0.	Program arguments: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900
# | 1.	Running pass 'CallGraph Pass Manager' on module '<stdin>'.
# | 2.	Running pass 'Verify generated machine code' on function '@atomic_load_global_monotonic_i16_d16_hi_vector_insert'
...

@petar-avramovic
Copy link
Collaborator Author

LLVM Buildbot has detected a new failure on builder llvm-clang-x86_64-expensive-checks-ubuntu running on as-builder-4 while building llvm at step 7 "test-check-all".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/187/builds/11235
Here is the relevant piece of the build log for the reference

#158039

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants