Skip to content

Conversation

@jrbyrnes
Copy link
Contributor

@jrbyrnes jrbyrnes commented Mar 6, 2025

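Remove the hard-coded `isNativeIntrinsic` allowlist for code maintainability: `isOpLegal` now treats every intrinsic call (in addition to stores) as legal. This may result in cases where we apply the optimization where it is not profitable, but those are likely to be rare.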

jrbyrnes added 2 commits March 6, 2025 09:14
Change-Id: I0459afe86aa0cec6ecc9c0d4ffafbf8d98bffd65
Change-Id: Id6dddaf5b1526617afaba89d18c0a54c60931469
@llvmbot
Copy link
Member

llvmbot commented Mar 6, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)

Changes

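Remove the hard-coded `isNativeIntrinsic` allowlist for code maintainability: `isOpLegal` now treats every intrinsic call (in addition to stores) as legal. This may result in cases where we apply the optimization where it is not profitable, but those are likely to be rare.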


Patch is 71.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130150.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp (+1-85)
  • (modified) llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll (+706-540)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 86af897943dae..9ae043048b932 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -127,92 +127,8 @@ class LiveRegOptimizer {
     return LK.first != TargetLoweringBase::TypeLegal;
   }
 
-  /// Check if intrinsic natively operates on 8-bit or 16-bit
-  bool isNativeIntrinsic(Intrinsic::ID ID) {
-    switch (ID) {
-    case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
-    case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
-    case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
-    case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
-    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
-    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
-    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
-    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
-    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
-    case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
-    case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
-    case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
-    case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
-    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
-    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
-    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
-    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
-    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
-    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
-    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
-    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
-    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
-    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
-    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
-    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
-    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
-    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
-    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
-    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
-    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
-    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
-    case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
-    case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
-    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
-    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
-    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
-    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
-    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
-    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
-    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
-    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
-    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
-    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
-    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
-    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
-    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
-    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
-    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
-    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
-    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
-    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
-    case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
-    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
-    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
-    case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
-    case Intrinsic::amdgcn_raw_buffer_store_format:
-    case Intrinsic::amdgcn_raw_buffer_store:
-    case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
-    case Intrinsic::amdgcn_raw_ptr_buffer_store:
-    case Intrinsic::amdgcn_struct_buffer_store_format:
-    case Intrinsic::amdgcn_struct_buffer_store:
-    case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
-    case Intrinsic::amdgcn_struct_ptr_buffer_store:
-    case Intrinsic::amdgcn_raw_tbuffer_store:
-    case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
-    case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
-    case Intrinsic::amdgcn_struct_tbuffer_store:
-      return true;
-    default:
-      return false;
-    }
-  }
-
   bool isOpLegal(Instruction *I) {
-    if (const auto *Intr = dyn_cast<IntrinsicInst>(I)) {
-      Intrinsic::ID ID = Intr->getIntrinsicID();
-      if (isNativeIntrinsic(ID))
-        return true;
-    }
-    // Stores
-    if (isa<StoreInst>(I))
-      return true;
-    return false;
+    return isa<StoreInst>(I) || isa<IntrinsicInst>(I);
   }
 
   bool isCoercionProfitable(Instruction *II) {
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index ee6a63fc1f7e1..a401f989a2507 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -1,37 +1,40 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefix=GFX942 %s
 
 define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v3i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT:    v_mov_b32_e32 v3, 8
-; GFX906-NEXT:    s_mov_b32 s4, 0xff0000
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v4, v2, s[0:1]
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX906-NEXT:    v_and_or_b32 v4, v4, s4, v5
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB0_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v0, v2, s[2:3]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX906-NEXT:    v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX906-NEXT:    v_and_or_b32 v4, v0, s4, v2
-; GFX906-NEXT:  .LBB0_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    global_store_byte_d16_hi v1, v4, s[6:7] offset:2
-; GFX906-NEXT:    global_store_short v1, v4, s[6:7]
-; GFX906-NEXT:    s_endpgm
+; GFX942-LABEL: v3i8_liveout:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v1, 2, v4
+; GFX942-NEXT:    v_mov_b32_e32 v2, 8
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v3, v1, s[0:1]
+; GFX942-NEXT:    s_mov_b32 s4, 0xff0000
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_sdwa v5, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_or_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX942-NEXT:    v_and_or_b32 v3, v3, s4, v5
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB0_2
+; GFX942-NEXT:  ; %bb.1: ; %bb.1
+; GFX942-NEXT:    global_load_dword v1, v1, s[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX942-NEXT:    v_and_or_b32 v3, v1, s4, v2
+; GFX942-NEXT:  .LBB0_2: ; %bb.2
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    global_store_byte_d16_hi v0, v3, s[6:7] offset:2
+; GFX942-NEXT:    global_store_short v0, v3, s[6:7]
+; GFX942-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
@@ -50,24 +53,25 @@ bb.2:
 }
 
 define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v4i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
-; GFX906-NEXT:    v_mov_b32_e32 v1, 0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dword v2, v3, s[0:1]
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB1_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dword v2, v3, s[2:3]
-; GFX906-NEXT:  .LBB1_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dword v1, v2, s[6:7]
-; GFX906-NEXT:    s_endpgm
+; GFX942-LABEL: v4i8_liveout:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 2, v3
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v2, s[0:1]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v3
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB1_2
+; GFX942-NEXT:  ; %bb.1: ; %bb.1
+; GFX942-NEXT:    global_load_dword v1, v2, s[2:3]
+; GFX942-NEXT:  .LBB1_2: ; %bb.2
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX942-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
@@ -86,28 +90,29 @@ bb.2:
 }
 
 define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v5i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; GFX906-NEXT:    v_mov_b32_e32 v3, 0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[0:1]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB2_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[2:3]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX906-NEXT:  .LBB2_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    global_store_byte v3, v2, s[6:7] offset:4
-; GFX906-NEXT:    global_store_dword v3, v1, s[6:7]
-; GFX906-NEXT:    s_endpgm
+; GFX942-LABEL: v5i8_liveout:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v3, 3, v4
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB2_2
+; GFX942-NEXT:  ; %bb.1: ; %bb.1
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v3, s[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX942-NEXT:  .LBB2_2: ; %bb.2
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    global_store_byte v2, v1, s[6:7] offset:4
+; GFX942-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX942-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
@@ -126,24 +131,25 @@ bb.2:
 }
 
 define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v8i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; GFX906-NEXT:    v_mov_b32_e32 v3, 0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[0:1]
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB3_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[2:3]
-; GFX906-NEXT:  .LBB3_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx2 v3, v[1:2], s[6:7]
-; GFX906-NEXT:    s_endpgm
+; GFX942-LABEL: v8i8_liveout:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v3, 3, v4
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB3_2
+; GFX942-NEXT:  ; %bb.1: ; %bb.1
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v3, s[2:3]
+; GFX942-NEXT:  .LBB3_2: ; %bb.2
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX942-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
@@ -162,24 +168,25 @@ bb.2:
 }
 
 define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v16i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
-; GFX906-NEXT:    v_mov_b32_e32 v5, 0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v6, s[0:1]
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB4_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v6, s[2:3]
-; GFX906-NEXT:  .LBB4_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    global_store_dwordx4 v5, v[1:4], s[6:7]
-; GFX906-NEXT:    s_endpgm
+; GFX942-LABEL: v16i8_liveout:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    v_and_b32_e32 v6, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v5, 4, v6
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v5, s[0:1]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v6
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB4_2
+; GFX942-NEXT:  ; %bb.1: ; %bb.1
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3]
+; GFX942-NEXT:  .LBB4_2: ; %bb.2
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
@@ -198,28 +205,29 @@ bb.2:
 }
 
 define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v32i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v10, 5, v0
-; GFX906-NEXT:    v_mov_b32_e32 v9, 0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v10, s[0:1] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v10, s[0:1]
-; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB5_2
-; GFX906-NEXT:  ; %bb.1: ; %bb.1
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v10, s[2:3] offset:16
-; GFX906-NEXT:    global_load_dwordx4 v[1:4], v10, s[2:3]
-; GFX906-NEXT:  .LBB5_2: ; %bb.2
-; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    global_store_dwordx4 v9, v[5:8], s[6:7] offset:16
-; GFX906-NEXT:    s_waitcnt vmcnt(1)
-; GFX906-NEXT:    global_store_dwordx4 v9, v[1:4], s[6:7]
-; GFX906-NEXT:    s_endpgm
+; GFX942-LABEL: v32i8_liveout:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    v_and_b32_e32 v10, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v9, 5, v10
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx4 v[4:7], v9, s[0:1] offset:16
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v9, s[0:1]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v10
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB5_2
+; GFX942-NEXT:  ; %bb.1: ; %bb.1
+; GFX942-NEXT:    global_load_dwordx4 v[4:7], v9, s[2:3] offset:16
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v9, s[2:3]
+; GFX942-NEXT:  .LBB5_2: ; %bb.2
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX942-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()
   %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
@@ -238,101 +246,77 @@ bb.2:
 }
 
 define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
-; GFX906-LABEL: v256i8_liveout:
-; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX906-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX906-NEXT:    v_lshlrev_b32_e32 v61, 3, v0
-; GFX906-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX906-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    global_load_dwordx4 v[13:16], v61, s[0:1] offset:240
-; GFX906-NEXT:    global_load_dwordx4 v[9:12], v61, s[0:1] offset:224
-; GFX906-NEXT:    global_load_dwordx4 v[5:8], v61, s[0:1] offset:208
-; GFX906-NEXT:    global_load_dwordx4 v[17:20], v61, s[0:1] offset:192
-; GFX906-NEXT:    s_mov_b32 s14, -1
-; GFX906-NEXT:    s_mov_b32 s15, 0xe00000
-; GFX906-NEXT:    s_add_u32 s12, s12, s11
-; GFX906-NEXT:    s_addc_u32 s13, s13, 0
-; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT:    v_mov_b32_e32 v4, 0
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
-; GFX906-NEXT:    buffer_store_dword v17, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX906-NEXT:    s_nop 0
-; GFX906-NEXT:    buffer_store_dword v18, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT:    buffer_store_dwor...
[truncated]

Copy link
Contributor

@choikwa choikwa left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@jrbyrnes jrbyrnes merged commit bf12954 into llvm:main Mar 6, 2025
13 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants