diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 6500fcee34061..7503cb49b06a0 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1452,6 +1452,69 @@ let WaveSizePredicate = isWave64 in {
 }
 
+// GFX11 RDNA3 WMMA patterns for bare intrinsic calls (no explicit modifiers).
+// Match the intrinsics directly and supply neutral modifiers to the
+// instruction. High AddedComplexity ensures these beat the generic
+// WMMARegularPat patterns, which fail to select these intrinsic forms.
+
+// Wave32 patterns (RDNA3 native wave size)
+let SubtargetPredicate = isGFX11Only, WaveSizePredicate = isWave32 in {
+
+  // FP16 WMMA: <8 x float> = wmma(<16 x half>, <16 x half>, <8 x float>)
+  def : GCNPat <
+    (v8f32 (int_amdgcn_wmma_f32_16x16x16_f16 v16f16:$a, v16f16:$b, v8f32:$c)),
+    (v8f32 (V_WMMA_F32_16X16X16_F16_twoaddr_w32 (i32 0), v16f16:$a, (i32 0), v16f16:$b, (i32 0), v8f32:$c))
+  > {
+    let AddedComplexity = 10000;
+  }
+
+  // BF16 WMMA: <8 x float> = wmma(<16 x i16>, <16 x i16>, <8 x float>)
+  def : GCNPat <
+    (v8f32 (int_amdgcn_wmma_f32_16x16x16_bf16 v16i16:$a, v16i16:$b, v8f32:$c)),
+    (v8f32 (V_WMMA_F32_16X16X16_BF16_twoaddr_w32 (i32 0), v16i16:$a, (i32 0), v16i16:$b, (i32 0), v8f32:$c))
+  > {
+    let AddedComplexity = 10000;
+  }
+
+  // INT8 WMMA: <8 x i32> = wmma(i1, <4 x i32>, i1, <4 x i32>, <8 x i32>, i1)
+  // The neg/clamp intrinsic operands are immarg, so they must be matched as
+  // timm; VOP3PModsNeg / as_i1timm then lower them to operand immediates.
+  def : GCNPat <
+    (v8i32 (int_amdgcn_wmma_i32_16x16x16_iu8 (i1 timm:$a_neg), v4i32:$a, (i1 timm:$b_neg), v4i32:$b, v8i32:$c, (i1 timm:$clamp))),
+    (v8i32 (V_WMMA_I32_16X16X16_IU8_twoaddr_w32 (VOP3PModsNeg $a_neg), v4i32:$a, (VOP3PModsNeg $b_neg), v4i32:$b, (i32 8), v8i32:$c, (as_i1timm $clamp)))
+  > {
+    let AddedComplexity = 10000;
+  }
+
+  // INT4 WMMA: <8 x i32> = wmma(i1, <2 x i32>, i1, <2 x i32>, <8 x i32>, i1)
+  def : GCNPat <
+    (v8i32 (int_amdgcn_wmma_i32_16x16x16_iu4 (i1 timm:$a_neg), v2i32:$a, (i1 timm:$b_neg), v2i32:$b, v8i32:$c, (i1 timm:$clamp))),
+    (v8i32 (V_WMMA_I32_16X16X16_IU4_twoaddr_w32 (VOP3PModsNeg $a_neg), v2i32:$a, (VOP3PModsNeg $b_neg), v2i32:$b, (i32 8), v8i32:$c, (as_i1timm $clamp)))
+  > {
+    let AddedComplexity = 10000;
+  }
+}
+
+// Wave64 patterns (compatibility mode)
+// NOTE(review): no wave64 IU8/IU4 patterns are provided here although the
+// wave64 test exercises those intrinsics — confirm they select elsewhere.
+let SubtargetPredicate = isGFX11Only, WaveSizePredicate = isWave64 in {
+
+  // FP16 WMMA Wave64: <4 x float> = wmma(<16 x half>, <16 x half>, <4 x float>)
+  def : GCNPat <
+    (v4f32 (int_amdgcn_wmma_f32_16x16x16_f16 v16f16:$a, v16f16:$b, v4f32:$c)),
+    (v4f32 (V_WMMA_F32_16X16X16_F16_twoaddr_w64 (i32 0), v16f16:$a, (i32 0), v16f16:$b, (i32 0), v4f32:$c))
+  > {
+    let AddedComplexity = 10000;
+  }
+
+  // BF16 WMMA Wave64: <4 x float> = wmma(<16 x i16>, <16 x i16>, <4 x float>)
+  def : GCNPat <
+    (v4f32 (int_amdgcn_wmma_f32_16x16x16_bf16 v16i16:$a, v16i16:$b, v4f32:$c)),
+    (v4f32 (V_WMMA_F32_16X16X16_BF16_twoaddr_w64 (i32 0), v16i16:$a, (i32 0), v16i16:$b, (i32 0), v4f32:$c))
+  > {
+    let AddedComplexity = 10000;
+  }
+}
+
 class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
                         bit _IsIU, bit _IsFP8BF8XF32,
                         bit _Has_ImodOp = 0, bit _HasMatrixFMT = 0,
                         bit _HasMatrixScale = 0,
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx11-kernel-w32.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx11-kernel-w32.ll
new file mode 100644
index 0000000000000..b2737ccd2da4d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx11-kernel-w32.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX11-W32
+
+; Test GFX11 WMMA with amdgpu_kernel (compute) calling convention
+; This test is critical to prevent regression of compute kernel WMMA support
+
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x float>)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1, <4 x i32>, i1, <4 x i32>, <8 x i32>, i1)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1, <2 x 
i32>, i1, <2 x i32>, <8 x i32>, i1) + +define amdgpu_kernel void @test_wmma_f32_16x16x16_f16_kernel( +; GFX11-W32-LABEL: test_wmma_f32_16x16x16_f16_kernel: +; GFX11-W32: ; %bb.0: ; %entry +; GFX11-W32-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-W32-NEXT: v_mov_b32_e32 v24, 0 +; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W32-NEXT: s_load_b256 s[8:15], s[0:1], 0x0 +; GFX11-W32-NEXT: s_load_b256 s[16:23], s[2:3], 0x0 +; GFX11-W32-NEXT: s_load_b256 s[24:31], s[4:5], 0x0 +; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W32-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-W32-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 +; GFX11-W32-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v17, s25 +; GFX11-W32-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 +; GFX11-W32-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-W32-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-W32-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 +; GFX11-W32-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v13, s21 +; GFX11-W32-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v15, s23 +; GFX11-W32-NEXT: v_dual_mov_b32 v18, s26 :: v_dual_mov_b32 v19, s27 +; GFX11-W32-NEXT: v_dual_mov_b32 v20, s28 :: v_dual_mov_b32 v21, s29 +; GFX11-W32-NEXT: v_dual_mov_b32 v22, s30 :: v_dual_mov_b32 v23, s31 +; GFX11-W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] +; GFX11-W32-NEXT: s_clause 0x1 +; GFX11-W32-NEXT: global_store_b128 v24, v[20:23], s[6:7] offset:16 +; GFX11-W32-NEXT: global_store_b128 v24, v[16:19], s[6:7] +; GFX11-W32-NEXT: s_nop 0 +; GFX11-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-W32-NEXT: s_endpgm + ptr addrspace(1) %a_ptr, + ptr addrspace(1) %b_ptr, + ptr addrspace(1) %c_ptr, + ptr addrspace(1) %out) { +entry: + %a = load <16 x half>, ptr addrspace(1) %a_ptr, align 32 + %b = load <16 x half>, ptr addrspace(1) %b_ptr, align 32 + %c = load <8 x 
float>, ptr addrspace(1) %c_ptr, align 32 + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %a, <16 x half> %b, <8 x float> %c) + store <8 x float> %res, ptr addrspace(1) %out, align 32 + ret void +} + +define amdgpu_kernel void @test_wmma_f32_16x16x16_bf16_kernel( +; GFX11-W32-LABEL: test_wmma_f32_16x16x16_bf16_kernel: +; GFX11-W32: ; %bb.0: ; %entry +; GFX11-W32-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-W32-NEXT: v_mov_b32_e32 v24, 0 +; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W32-NEXT: s_load_b256 s[8:15], s[0:1], 0x0 +; GFX11-W32-NEXT: s_load_b256 s[16:23], s[2:3], 0x0 +; GFX11-W32-NEXT: s_load_b256 s[24:31], s[4:5], 0x0 +; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W32-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-W32-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 +; GFX11-W32-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v17, s25 +; GFX11-W32-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 +; GFX11-W32-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-W32-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-W32-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 +; GFX11-W32-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v13, s21 +; GFX11-W32-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v15, s23 +; GFX11-W32-NEXT: v_dual_mov_b32 v18, s26 :: v_dual_mov_b32 v19, s27 +; GFX11-W32-NEXT: v_dual_mov_b32 v20, s28 :: v_dual_mov_b32 v21, s29 +; GFX11-W32-NEXT: v_dual_mov_b32 v22, s30 :: v_dual_mov_b32 v23, s31 +; GFX11-W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] +; GFX11-W32-NEXT: s_clause 0x1 +; GFX11-W32-NEXT: global_store_b128 v24, v[20:23], s[6:7] offset:16 +; GFX11-W32-NEXT: global_store_b128 v24, v[16:19], s[6:7] +; GFX11-W32-NEXT: s_nop 0 +; GFX11-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-W32-NEXT: s_endpgm + ptr addrspace(1) %a_ptr, + ptr addrspace(1) %b_ptr, + ptr 
addrspace(1) %c_ptr, + ptr addrspace(1) %out) { +entry: + %a = load <16 x i16>, ptr addrspace(1) %a_ptr, align 32 + %b = load <16 x i16>, ptr addrspace(1) %b_ptr, align 32 + %c = load <8 x float>, ptr addrspace(1) %c_ptr, align 32 + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %a, <16 x i16> %b, <8 x float> %c) + store <8 x float> %res, ptr addrspace(1) %out, align 32 + ret void +} + +define amdgpu_kernel void @test_wmma_i32_16x16x16_iu8_kernel( +; GFX11-W32-LABEL: test_wmma_i32_16x16x16_iu8_kernel: +; GFX11-W32: ; %bb.0: ; %entry +; GFX11-W32-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-W32-NEXT: v_mov_b32_e32 v16, 0 +; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W32-NEXT: s_load_b128 s[16:19], s[0:1], 0x0 +; GFX11-W32-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-W32-NEXT: s_load_b256 s[8:15], s[4:5], 0x0 +; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W32-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 +; GFX11-W32-NEXT: v_dual_mov_b32 v15, s3 :: v_dual_mov_b32 v14, s2 +; GFX11-W32-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-W32-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 +; GFX11-W32-NEXT: v_dual_mov_b32 v13, s1 :: v_dual_mov_b32 v12, s0 +; GFX11-W32-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 +; GFX11-W32-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-W32-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-W32-NEXT: v_wmma_i32_16x16x16_iu8 v[0:7], v[8:11], v[12:15], v[0:7] +; GFX11-W32-NEXT: s_clause 0x1 +; GFX11-W32-NEXT: global_store_b128 v16, v[4:7], s[6:7] offset:16 +; GFX11-W32-NEXT: global_store_b128 v16, v[0:3], s[6:7] +; GFX11-W32-NEXT: s_nop 0 +; GFX11-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-W32-NEXT: s_endpgm + ptr addrspace(1) %a_ptr, + ptr addrspace(1) %b_ptr, + ptr addrspace(1) %c_ptr, + ptr addrspace(1) %out) { +entry: + %a = load <4 x i32>, ptr addrspace(1) %a_ptr, align 
16 + %b = load <4 x i32>, ptr addrspace(1) %b_ptr, align 16 + %c = load <8 x i32>, ptr addrspace(1) %c_ptr, align 32 + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %a, i1 0, <4 x i32> %b, <8 x i32> %c, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out, align 32 + ret void +} + +define amdgpu_kernel void @test_wmma_i32_16x16x16_iu4_kernel( +; GFX11-W32-LABEL: test_wmma_i32_16x16x16_iu4_kernel: +; GFX11-W32: ; %bb.0: ; %entry +; GFX11-W32-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-W32-NEXT: v_mov_b32_e32 v12, 0 +; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-W32-NEXT: s_load_b256 s[8:15], s[4:5], 0x0 +; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W32-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 +; GFX11-W32-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-W32-NEXT: v_dual_mov_b32 v11, s3 :: v_dual_mov_b32 v10, s2 +; GFX11-W32-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 +; GFX11-W32-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-W32-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-W32-NEXT: v_wmma_i32_16x16x16_iu4 v[0:7], v[8:9], v[10:11], v[0:7] +; GFX11-W32-NEXT: s_clause 0x1 +; GFX11-W32-NEXT: global_store_b128 v12, v[4:7], s[6:7] offset:16 +; GFX11-W32-NEXT: global_store_b128 v12, v[0:3], s[6:7] +; GFX11-W32-NEXT: s_nop 0 +; GFX11-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-W32-NEXT: s_endpgm + ptr addrspace(1) %a_ptr, + ptr addrspace(1) %b_ptr, + ptr addrspace(1) %c_ptr, + ptr addrspace(1) %out) { +entry: + %a = load <2 x i32>, ptr addrspace(1) %a_ptr, align 8 + %b = load <2 x i32>, ptr addrspace(1) %b_ptr, align 8 + %c = load <8 x i32>, ptr addrspace(1) %c_ptr, align 32 + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %a, i1 0, <2 x i32> %b, <8 x i32> %c, i1 0) + store <8 x 
i32> %res, ptr addrspace(1) %out, align 32 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx11-kernel-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx11-kernel-w64.ll new file mode 100644 index 0000000000000..f36addd8f216b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx11-kernel-w64.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX11-W64 + +; Test GFX11 WMMA with amdgpu_kernel (compute) calling convention - Wave64 mode +; Wave64 uses smaller accumulator vectors compared to Wave32 + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1, <4 x i32>, i1, <4 x i32>, <4 x i32>, i1) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1, <2 x i32>, i1, <2 x i32>, <4 x i32>, i1) + +define amdgpu_kernel void @test_wmma_f32_16x16x16_f16_kernel_w64( +; GFX11-W64-LABEL: test_wmma_f32_16x16x16_f16_kernel_w64: +; GFX11-W64: ; %bb.0: ; %entry +; GFX11-W64-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-W64-NEXT: v_mov_b32_e32 v20, 0 +; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W64-NEXT: s_load_b256 s[8:15], s[0:1], 0x0 +; GFX11-W64-NEXT: s_load_b256 s[16:23], s[2:3], 0x0 +; GFX11-W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W64-NEXT: v_mov_b32_e32 v0, s8 +; GFX11-W64-NEXT: v_mov_b32_e32 v8, s16 +; GFX11-W64-NEXT: v_mov_b32_e32 v19, s3 +; GFX11-W64-NEXT: v_mov_b32_e32 v1, s9 +; GFX11-W64-NEXT: v_mov_b32_e32 v2, s10 +; GFX11-W64-NEXT: v_mov_b32_e32 v3, s11 +; GFX11-W64-NEXT: v_mov_b32_e32 v4, s12 +; GFX11-W64-NEXT: v_mov_b32_e32 v5, s13 +; GFX11-W64-NEXT: v_mov_b32_e32 v6, s14 +; GFX11-W64-NEXT: v_mov_b32_e32 v7, s15 +; GFX11-W64-NEXT: 
v_mov_b32_e32 v9, s17 +; GFX11-W64-NEXT: v_mov_b32_e32 v10, s18 +; GFX11-W64-NEXT: v_mov_b32_e32 v11, s19 +; GFX11-W64-NEXT: v_mov_b32_e32 v12, s20 +; GFX11-W64-NEXT: v_mov_b32_e32 v13, s21 +; GFX11-W64-NEXT: v_mov_b32_e32 v14, s22 +; GFX11-W64-NEXT: v_mov_b32_e32 v15, s23 +; GFX11-W64-NEXT: v_mov_b32_e32 v18, s2 +; GFX11-W64-NEXT: v_mov_b32_e32 v17, s1 +; GFX11-W64-NEXT: v_mov_b32_e32 v16, s0 +; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] +; GFX11-W64-NEXT: global_store_b128 v20, v[16:19], s[6:7] +; GFX11-W64-NEXT: s_nop 0 +; GFX11-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-W64-NEXT: s_endpgm + ptr addrspace(1) %a_ptr, + ptr addrspace(1) %b_ptr, + ptr addrspace(1) %c_ptr, + ptr addrspace(1) %out) { +entry: + %a = load <16 x half>, ptr addrspace(1) %a_ptr, align 32 + %b = load <16 x half>, ptr addrspace(1) %b_ptr, align 32 + %c = load <4 x float>, ptr addrspace(1) %c_ptr, align 16 + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %a, <16 x half> %b, <4 x float> %c) + store <4 x float> %res, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @test_wmma_f32_16x16x16_bf16_kernel_w64( +; GFX11-W64-LABEL: test_wmma_f32_16x16x16_bf16_kernel_w64: +; GFX11-W64: ; %bb.0: ; %entry +; GFX11-W64-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-W64-NEXT: v_mov_b32_e32 v20, 0 +; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W64-NEXT: s_load_b256 s[8:15], s[0:1], 0x0 +; GFX11-W64-NEXT: s_load_b256 s[16:23], s[2:3], 0x0 +; GFX11-W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W64-NEXT: v_mov_b32_e32 v0, s8 +; GFX11-W64-NEXT: v_mov_b32_e32 v8, s16 +; GFX11-W64-NEXT: v_mov_b32_e32 v19, s3 +; GFX11-W64-NEXT: v_mov_b32_e32 v1, s9 +; GFX11-W64-NEXT: v_mov_b32_e32 v2, s10 +; GFX11-W64-NEXT: v_mov_b32_e32 v3, s11 +; GFX11-W64-NEXT: v_mov_b32_e32 v4, s12 +; GFX11-W64-NEXT: v_mov_b32_e32 v5, s13 +; 
GFX11-W64-NEXT: v_mov_b32_e32 v6, s14 +; GFX11-W64-NEXT: v_mov_b32_e32 v7, s15 +; GFX11-W64-NEXT: v_mov_b32_e32 v9, s17 +; GFX11-W64-NEXT: v_mov_b32_e32 v10, s18 +; GFX11-W64-NEXT: v_mov_b32_e32 v11, s19 +; GFX11-W64-NEXT: v_mov_b32_e32 v12, s20 +; GFX11-W64-NEXT: v_mov_b32_e32 v13, s21 +; GFX11-W64-NEXT: v_mov_b32_e32 v14, s22 +; GFX11-W64-NEXT: v_mov_b32_e32 v15, s23 +; GFX11-W64-NEXT: v_mov_b32_e32 v18, s2 +; GFX11-W64-NEXT: v_mov_b32_e32 v17, s1 +; GFX11-W64-NEXT: v_mov_b32_e32 v16, s0 +; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] +; GFX11-W64-NEXT: global_store_b128 v20, v[16:19], s[6:7] +; GFX11-W64-NEXT: s_nop 0 +; GFX11-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-W64-NEXT: s_endpgm + ptr addrspace(1) %a_ptr, + ptr addrspace(1) %b_ptr, + ptr addrspace(1) %c_ptr, + ptr addrspace(1) %out) { +entry: + %a = load <16 x i16>, ptr addrspace(1) %a_ptr, align 32 + %b = load <16 x i16>, ptr addrspace(1) %b_ptr, align 32 + %c = load <4 x float>, ptr addrspace(1) %c_ptr, align 16 + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %a, <16 x i16> %b, <4 x float> %c) + store <4 x float> %res, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @test_wmma_i32_16x16x16_iu8_kernel_w64( +; GFX11-W64-LABEL: test_wmma_i32_16x16x16_iu8_kernel_w64: +; GFX11-W64: ; %bb.0: ; %entry +; GFX11-W64-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-W64-NEXT: v_mov_b32_e32 v12, 0 +; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x0 +; GFX11-W64-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-W64-NEXT: s_load_b128 s[12:15], s[4:5], 0x0 +; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W64-NEXT: v_mov_b32_e32 v0, s8 +; GFX11-W64-NEXT: v_mov_b32_e32 v7, s3 +; GFX11-W64-NEXT: v_mov_b32_e32 v8, s12 +; GFX11-W64-NEXT: v_mov_b32_e32 v1, s9 +; GFX11-W64-NEXT: v_mov_b32_e32 v2, s10 +; GFX11-W64-NEXT: v_mov_b32_e32 v3, s11 +; 
GFX11-W64-NEXT: v_mov_b32_e32 v6, s2 +; GFX11-W64-NEXT: v_mov_b32_e32 v5, s1 +; GFX11-W64-NEXT: v_mov_b32_e32 v4, s0 +; GFX11-W64-NEXT: v_mov_b32_e32 v9, s13 +; GFX11-W64-NEXT: v_mov_b32_e32 v10, s14 +; GFX11-W64-NEXT: v_mov_b32_e32 v11, s15 +; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX11-W64-NEXT: global_store_b128 v12, v[8:11], s[6:7] +; GFX11-W64-NEXT: s_nop 0 +; GFX11-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-W64-NEXT: s_endpgm + ptr addrspace(1) %a_ptr, + ptr addrspace(1) %b_ptr, + ptr addrspace(1) %c_ptr, + ptr addrspace(1) %out) { +entry: + %a = load <4 x i32>, ptr addrspace(1) %a_ptr, align 16 + %b = load <4 x i32>, ptr addrspace(1) %b_ptr, align 16 + %c = load <4 x i32>, ptr addrspace(1) %c_ptr, align 16 + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %a, i1 0, <4 x i32> %b, <4 x i32> %c, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @test_wmma_i32_16x16x16_iu4_kernel_w64( +; GFX11-W64-LABEL: test_wmma_i32_16x16x16_iu4_kernel_w64: +; GFX11-W64: ; %bb.0: ; %entry +; GFX11-W64-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 +; GFX11-W64-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W64-NEXT: s_load_b64 s[8:9], s[0:1], 0x0 +; GFX11-W64-NEXT: s_load_b64 s[10:11], s[2:3], 0x0 +; GFX11-W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-W64-NEXT: v_mov_b32_e32 v4, s8 +; GFX11-W64-NEXT: v_mov_b32_e32 v6, s10 +; GFX11-W64-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-W64-NEXT: v_mov_b32_e32 v5, s9 +; GFX11-W64-NEXT: v_mov_b32_e32 v7, s11 +; GFX11-W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-W64-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-W64-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-W64-NEXT: v_wmma_i32_16x16x16_iu4 v[0:3], v[4:5], v[6:7], v[0:3] +; GFX11-W64-NEXT: global_store_b128 v8, v[0:3], s[6:7] 
+; GFX11-W64-NEXT: s_nop 0 +; GFX11-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-W64-NEXT: s_endpgm + ptr addrspace(1) %a_ptr, + ptr addrspace(1) %b_ptr, + ptr addrspace(1) %c_ptr, + ptr addrspace(1) %out) { +entry: + %a = load <2 x i32>, ptr addrspace(1) %a_ptr, align 8 + %b = load <2 x i32>, ptr addrspace(1) %b_ptr, align 8 + %c = load <4 x i32>, ptr addrspace(1) %c_ptr, align 16 + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %a, i1 0, <2 x i32> %b, <4 x i32> %c, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out, align 16 + ret void +}