Skip to content

Conversation

@arsenm
Copy link
Contributor

@arsenm arsenm commented Jan 9, 2025

We already custom lower shuffles of the other 16-bit element types (f16 and i16); do the same for bf16.

arsenm added 2 commits January 9, 2025 17:57
We already custom lower the other 16-bit element type shuffles.
Copy link
Contributor Author

arsenm commented Jan 9, 2025

This stack of pull requests is managed by Graphite. Learn more about stacking.

@arsenm arsenm changed the title from "AMDGPU: Add gfx940 run line to shuffle vector test" to "AMDGPU: Custom lower bf16 shuffles" Jan 9, 2025
@llvmbot
Copy link
Member

llvmbot commented Jan 9, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

We already custom lower the other 16-bit element type shuffles.


Patch is 142.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/122252.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+5-5)
  • (modified) llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll (+1696-906)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0ac84f4e1f02af..992f7ed99d3bb7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -784,8 +784,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                        {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
 
     setOperationAction(ISD::VECTOR_SHUFFLE,
-                       {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
-                        MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
+                       {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
+                        MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
+                        MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
                        Custom);
 
     for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
@@ -7545,9 +7546,8 @@ SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
   SDLoc SL(Op);
   EVT ResultVT = Op.getValueType();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
-
-  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
-  EVT EltVT = PackVT.getVectorElementType();
+  MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
+  MVT PackVT = MVT::getVectorVT(EltVT, 2);
   int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
 
   // vector_shuffle <0,1,6,7> lhs, rhs
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index e408e83da1c298..e7ae9d831424cc 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
 
@@ -31,16 +32,27 @@ define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_234u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v6
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_234u:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_mov_b32_e32 v0, v6
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v1, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_234u:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v4
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, v6
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_234u:
 ; GFX10:       ; %bb.0:
@@ -94,13 +106,22 @@ define <4 x half> @shuffle_v4f16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_u3u1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_u3u1:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v0, v2
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_u3u1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    v_mov_b32_e32 v1, v2
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_u3u1:
 ; GFX10:       ; %bb.0:
@@ -151,16 +172,27 @@ define <4 x half> @shuffle_v4f16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_3u6u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v0, s4, v5, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_3u6u:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GX900-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_alignbit_b32 v0, s4, v5, 16
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v1, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_3u6u:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX940-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_alignbit_b32 v0, s0, v5, 16
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, v4
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_3u6u:
 ; GFX10:       ; %bb.0:
@@ -189,16 +221,27 @@ define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_3uu7:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v0, s4, v5, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_3uu7:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GX900-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_alignbit_b32 v0, s4, v5, 16
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v1, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_3uu7:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX940-NEXT:    global_load_dword v4, v[2:3], off offset:4
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_alignbit_b32 v0, s0, v5, 16
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, v4
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_3uu7:
 ; GFX10:       ; %bb.0:
@@ -227,16 +270,27 @@ define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_35u5:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v4, v[2:3], off
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_35u5:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GX900-NEXT:    global_load_dword v4, v[2:3], off
+; GX900-NEXT:    s_mov_b32 s4, 0x7060302
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_perm_b32 v0, v4, v5, s4
+; GX900-NEXT:    v_mov_b32_e32 v1, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_35u5:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; GFX940-NEXT:    global_load_dword v4, v[2:3], off
+; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_perm_b32 v0, v4, v5, s0
+; GFX940-NEXT:    v_mov_b32_e32 v1, v4
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_35u5:
 ; GFX10:       ; %bb.0:
@@ -263,17 +317,29 @@ define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_357u:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v1, s4, v5, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v4, v6, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_357u:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT:    s_mov_b32 s4, 0x7060302
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_alignbit_b32 v1, s4, v5, 16
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_357u:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX940-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_alignbit_b32 v1, s0, v5, 16
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_perm_b32 v0, v4, v6, s0
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_357u:
 ; GFX10:       ; %bb.0:
@@ -432,13 +498,22 @@ define <4 x half> @shuffle_v4f16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_2301:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_2301:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v0, v2
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_2301:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    v_mov_b32_e32 v1, v2
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_2301:
 ; GFX10:       ; %bb.0:
@@ -773,13 +848,22 @@ define <4 x half> @shuffle_v4f16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_6745:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_6745:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v0, v2
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_6745:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    v_mov_b32_e32 v1, v2
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_6745:
 ; GFX10:       ; %bb.0:
@@ -833,16 +917,27 @@ define <4 x half> @shuffle_v4f16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_2356:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v1, v6, v5, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_2356:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v0, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_2356:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v4
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_2356:
 ; GFX10:       ; %bb.0:
@@ -871,16 +966,27 @@ define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_5623:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v0, v6, v5, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_5623:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_alignbit_b32 v0, v6, v5, 16
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v1, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_5623:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_alignbit_b32 v0, v7, v6, 16
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, v4
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_5623:
 ; GFX10:       ; %bb.0:
@@ -987,17 +1093,29 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4f16_5734:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4f16_5734:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT:    s_mov_b32 s4, 0x7060302
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4f16_5734:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX940-NEXT:    global_load_dword v6, v[0:1], off offset:4
+; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_perm_b32 v0, v5, v4, s0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_5734:
 ; GFX10:       ; %bb.0:
@@ -1027,16 +1145,27 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 
 define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GFX9-LABEL: shuffle_v4i16_2356:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
-; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_alignbit_b32 v1, v6, v5, 16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GX900-LABEL: shuffle_v4i16_2356:
+; GX900:       ; %bb.0:
+; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
+; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GX900-NEXT:    s_waitcnt vmcnt(1)
+; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GX900-NEXT:    s_waitcnt vmcnt(0)
+; GX900-NEXT:    v_mov_b32_e32 v0, v4
+; GX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: shuffle_v4i16_2356:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
+; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; GFX940-NEXT:    s_waitcnt vmcnt(1)
+; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, v4
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i16_2356:
 ; GFX10:       ; %bb.0:
@@ -1101,15 +1230,25 @@ define <4 x i16> @shuffle_v4i16_0167(ptr addrsp...
[truncated]

@arsenm arsenm marked this pull request as ready for review January 9, 2025 11:03
@arsenm arsenm merged commit d2b78c6 into main Jan 9, 2025
12 checks passed
@arsenm arsenm deleted the users/arsenm/amdgpu/custom-lower-bf16-shuffles branch January 9, 2025 14:37
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants