Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1452,6 +1452,66 @@ let WaveSizePredicate = isWave64 in {

}

// GFX11 RDNA3 WMMA patterns for bare intrinsic calls (no explicit modifiers)
// Match intrinsics directly and provide zero modifiers to the instruction
// High AddedComplexity ensures these beat the broken WMMARegularPat patterns
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pattern can't be broken, should not be avoiding broken patterns with AddedComplexity


// Wave32 patterns (RDNA3 native wave size)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Wave32 patterns (RDNA3 native wave size)
// Wave32 patterns

// GFX11-only, wave32 selection patterns that map the WMMA intrinsics straight
// onto the two-address w32 instructions. AddedComplexity is applied once to
// the whole group rather than repeated in every def body.
let SubtargetPredicate = isGFX11Only, WaveSizePredicate = isWave32,
    AddedComplexity = 10000 in {

  // f16 inputs: v8f32 = wmma(v16f16 a, v16f16 b, v8f32 c).
  // Every modifier operand handed to the instruction is the constant 0.
  def : GCNPat<
    (v8f32 (int_amdgcn_wmma_f32_16x16x16_f16 v16f16:$a, v16f16:$b, v8f32:$c)),
    (v8f32 (V_WMMA_F32_16X16X16_F16_twoaddr_w32 (i32 0), v16f16:$a, (i32 0), v16f16:$b, (i32 0), v8f32:$c))
  >;

  // bf16 inputs carried as v16i16: v8f32 = wmma(v16i16 a, v16i16 b, v8f32 c).
  def : GCNPat<
    (v8f32 (int_amdgcn_wmma_f32_16x16x16_bf16 v16i16:$a, v16i16:$b, v8f32:$c)),
    (v8f32 (V_WMMA_F32_16X16X16_BF16_twoaddr_w32 (i32 0), v16i16:$a, (i32 0), v16i16:$b, (i32 0), v8f32:$c))
  >;

  // 8-bit integer inputs: v8i32 = wmma(i1, v4i32 a, i1, v4i32 b, v8i32 c, i1).
  // The two leading i1 operands are routed through VOP3PModsNeg and the
  // trailing i1 is forwarded as the clamp operand.
  // NOTE(review): the literal (i32 8) in the third modifier slot is kept
  // as-is; its meaning is not visible here - confirm.
  def : GCNPat<
    (v8i32 (int_amdgcn_wmma_i32_16x16x16_iu8 i1:$a_neg, v4i32:$a, i1:$b_neg, v4i32:$b, v8i32:$c, i1:$clamp)),
    (v8i32 (V_WMMA_I32_16X16X16_IU8_twoaddr_w32 (VOP3PModsNeg $a_neg), v4i32:$a, (VOP3PModsNeg $b_neg), v4i32:$b, (i32 8), v8i32:$c, i1:$clamp))
  >;

  // 4-bit integer inputs: v8i32 = wmma(i1, v2i32 a, i1, v2i32 b, v8i32 c, i1).
  // Same operand layout as the iu8 pattern, with v2i32 sources.
  def : GCNPat<
    (v8i32 (int_amdgcn_wmma_i32_16x16x16_iu4 i1:$a_neg, v2i32:$a, i1:$b_neg, v2i32:$b, v8i32:$c, i1:$clamp)),
    (v8i32 (V_WMMA_I32_16X16X16_IU4_twoaddr_w32 (VOP3PModsNeg $a_neg), v2i32:$a, (VOP3PModsNeg $b_neg), v2i32:$b, (i32 8), v8i32:$c, i1:$clamp))
  >;
}

// Wave64 patterns (compatibility mode)
// GFX11-only, wave64 counterparts of the floating-point WMMA patterns. The
// accumulator and result are v4f32 here. AddedComplexity is applied once to
// the whole group rather than repeated in every def body.
let SubtargetPredicate = isGFX11Only, WaveSizePredicate = isWave64,
    AddedComplexity = 10000 in {

  // f16 inputs: v4f32 = wmma(v16f16 a, v16f16 b, v4f32 c).
  // Every modifier operand handed to the instruction is the constant 0.
  def : GCNPat<
    (v4f32 (int_amdgcn_wmma_f32_16x16x16_f16 v16f16:$a, v16f16:$b, v4f32:$c)),
    (v4f32 (V_WMMA_F32_16X16X16_F16_twoaddr_w64 (i32 0), v16f16:$a, (i32 0), v16f16:$b, (i32 0), v4f32:$c))
  >;

  // bf16 inputs carried as v16i16: v4f32 = wmma(v16i16 a, v16i16 b, v4f32 c).
  def : GCNPat<
    (v4f32 (int_amdgcn_wmma_f32_16x16x16_bf16 v16i16:$a, v16i16:$b, v4f32:$c)),
    (v4f32 (V_WMMA_F32_16X16X16_BF16_twoaddr_w64 (i32 0), v16i16:$a, (i32 0), v16i16:$b, (i32 0), v4f32:$c))
  >;
}

class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType,
bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0,
bit _HasMatrixFMT = 0, bit _HasMatrixScale = 0,
Expand Down
172 changes: 172 additions & 0 deletions llvm/test/CodeGen/AMDGPU/wmma-gfx11-kernel-w32.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX11-W32
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX11-W32
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s --check-prefix=GFX11-W32


; Test GFX11 WMMA with amdgpu_kernel (compute) calling convention
; This test is critical to prevent regression of compute kernel WMMA support
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't need this line. Most tests are critical to prevent regression of whatever they're testing.


declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x float>)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1, <4 x i32>, i1, <4 x i32>, <8 x i32>, i1)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1, <2 x i32>, i1, <2 x i32>, <8 x i32>, i1)
Comment on lines +7 to +10
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need to declare intrinsics (although some people still like to for reasons I don't understand).


; amdgpu_kernel test for the f16 WMMA intrinsic: loads A and B as <16 x half>
; and C as <8 x float> through addrspace(1) pointers, calls
; llvm.amdgcn.wmma.f32.16x16x16.f16 and stores the <8 x float> result to %out.
; The autogenerated checks expect one v_wmma_f32_16x16x16_f16 whose destination
; registers are the same as its C operand registers.
define amdgpu_kernel void @test_wmma_f32_16x16x16_f16_kernel(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the kernel part really matter, or is it !graphics? Can you also test this just in a regular CCC function?

; GFX11-W32-LABEL: test_wmma_f32_16x16x16_f16_kernel:
; GFX11-W32: ; %bb.0: ; %entry
; GFX11-W32-NEXT: s_load_b256 s[0:7], s[2:3], 0x0
; GFX11-W32-NEXT: v_mov_b32_e32 v24, 0
; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-W32-NEXT: s_load_b256 s[8:15], s[0:1], 0x0
; GFX11-W32-NEXT: s_load_b256 s[16:23], s[2:3], 0x0
; GFX11-W32-NEXT: s_load_b256 s[24:31], s[4:5], 0x0
; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-W32-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX11-W32-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17
; GFX11-W32-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v17, s25
; GFX11-W32-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX11-W32-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
; GFX11-W32-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
; GFX11-W32-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19
; GFX11-W32-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v13, s21
; GFX11-W32-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v15, s23
; GFX11-W32-NEXT: v_dual_mov_b32 v18, s26 :: v_dual_mov_b32 v19, s27
; GFX11-W32-NEXT: v_dual_mov_b32 v20, s28 :: v_dual_mov_b32 v21, s29
; GFX11-W32-NEXT: v_dual_mov_b32 v22, s30 :: v_dual_mov_b32 v23, s31
; GFX11-W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
; GFX11-W32-NEXT: s_clause 0x1
; GFX11-W32-NEXT: global_store_b128 v24, v[20:23], s[6:7] offset:16
; GFX11-W32-NEXT: global_store_b128 v24, v[16:19], s[6:7]
; GFX11-W32-NEXT: s_nop 0
; GFX11-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-W32-NEXT: s_endpgm
ptr addrspace(1) %a_ptr,
ptr addrspace(1) %b_ptr,
ptr addrspace(1) %c_ptr,
ptr addrspace(1) %out) {
entry:
%a = load <16 x half>, ptr addrspace(1) %a_ptr, align 32
%b = load <16 x half>, ptr addrspace(1) %b_ptr, align 32
%c = load <8 x float>, ptr addrspace(1) %c_ptr, align 32
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %a, <16 x half> %b, <8 x float> %c)
store <8 x float> %res, ptr addrspace(1) %out, align 32
ret void
}

; amdgpu_kernel test for the bf16 WMMA intrinsic: loads A and B as <16 x i16>
; and C as <8 x float> through addrspace(1) pointers, calls
; llvm.amdgcn.wmma.f32.16x16x16.bf16 and stores the <8 x float> result to %out.
; The autogenerated checks expect one v_wmma_f32_16x16x16_bf16 whose
; destination registers are the same as its C operand registers.
define amdgpu_kernel void @test_wmma_f32_16x16x16_bf16_kernel(
; GFX11-W32-LABEL: test_wmma_f32_16x16x16_bf16_kernel:
; GFX11-W32: ; %bb.0: ; %entry
; GFX11-W32-NEXT: s_load_b256 s[0:7], s[2:3], 0x0
; GFX11-W32-NEXT: v_mov_b32_e32 v24, 0
; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-W32-NEXT: s_load_b256 s[8:15], s[0:1], 0x0
; GFX11-W32-NEXT: s_load_b256 s[16:23], s[2:3], 0x0
; GFX11-W32-NEXT: s_load_b256 s[24:31], s[4:5], 0x0
; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-W32-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX11-W32-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17
; GFX11-W32-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v17, s25
; GFX11-W32-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX11-W32-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
; GFX11-W32-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
; GFX11-W32-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19
; GFX11-W32-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v13, s21
; GFX11-W32-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v15, s23
; GFX11-W32-NEXT: v_dual_mov_b32 v18, s26 :: v_dual_mov_b32 v19, s27
; GFX11-W32-NEXT: v_dual_mov_b32 v20, s28 :: v_dual_mov_b32 v21, s29
; GFX11-W32-NEXT: v_dual_mov_b32 v22, s30 :: v_dual_mov_b32 v23, s31
; GFX11-W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
; GFX11-W32-NEXT: s_clause 0x1
; GFX11-W32-NEXT: global_store_b128 v24, v[20:23], s[6:7] offset:16
; GFX11-W32-NEXT: global_store_b128 v24, v[16:19], s[6:7]
; GFX11-W32-NEXT: s_nop 0
; GFX11-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-W32-NEXT: s_endpgm
ptr addrspace(1) %a_ptr,
ptr addrspace(1) %b_ptr,
ptr addrspace(1) %c_ptr,
ptr addrspace(1) %out) {
entry:
%a = load <16 x i16>, ptr addrspace(1) %a_ptr, align 32
%b = load <16 x i16>, ptr addrspace(1) %b_ptr, align 32
%c = load <8 x float>, ptr addrspace(1) %c_ptr, align 32
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %a, <16 x i16> %b, <8 x float> %c)
store <8 x float> %res, ptr addrspace(1) %out, align 32
ret void
}

; amdgpu_kernel test for the iu8 WMMA intrinsic: loads A and B as <4 x i32>
; and C as <8 x i32> through addrspace(1) pointers, calls
; llvm.amdgcn.wmma.i32.16x16x16.iu8 with all three i1 arguments set to 0 and
; stores the <8 x i32> result to %out.
; The autogenerated checks expect one v_wmma_i32_16x16x16_iu8 whose destination
; registers are the same as its C operand registers.
define amdgpu_kernel void @test_wmma_i32_16x16x16_iu8_kernel(
; GFX11-W32-LABEL: test_wmma_i32_16x16x16_iu8_kernel:
; GFX11-W32: ; %bb.0: ; %entry
; GFX11-W32-NEXT: s_load_b256 s[0:7], s[2:3], 0x0
; GFX11-W32-NEXT: v_mov_b32_e32 v16, 0
; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-W32-NEXT: s_load_b128 s[16:19], s[0:1], 0x0
; GFX11-W32-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
; GFX11-W32-NEXT: s_load_b256 s[8:15], s[4:5], 0x0
; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-W32-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17
; GFX11-W32-NEXT: v_dual_mov_b32 v15, s3 :: v_dual_mov_b32 v14, s2
; GFX11-W32-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX11-W32-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19
; GFX11-W32-NEXT: v_dual_mov_b32 v13, s1 :: v_dual_mov_b32 v12, s0
; GFX11-W32-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX11-W32-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
; GFX11-W32-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
; GFX11-W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-W32-NEXT: v_wmma_i32_16x16x16_iu8 v[0:7], v[8:11], v[12:15], v[0:7]
; GFX11-W32-NEXT: s_clause 0x1
; GFX11-W32-NEXT: global_store_b128 v16, v[4:7], s[6:7] offset:16
; GFX11-W32-NEXT: global_store_b128 v16, v[0:3], s[6:7]
; GFX11-W32-NEXT: s_nop 0
; GFX11-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-W32-NEXT: s_endpgm
ptr addrspace(1) %a_ptr,
ptr addrspace(1) %b_ptr,
ptr addrspace(1) %c_ptr,
ptr addrspace(1) %out) {
entry:
%a = load <4 x i32>, ptr addrspace(1) %a_ptr, align 16
%b = load <4 x i32>, ptr addrspace(1) %b_ptr, align 16
%c = load <8 x i32>, ptr addrspace(1) %c_ptr, align 32
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %a, i1 0, <4 x i32> %b, <8 x i32> %c, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}

; amdgpu_kernel test for the iu4 WMMA intrinsic: loads A and B as <2 x i32>
; and C as <8 x i32> through addrspace(1) pointers, calls
; llvm.amdgcn.wmma.i32.16x16x16.iu4 with all three i1 arguments set to 0 and
; stores the <8 x i32> result to %out.
; The autogenerated checks expect one v_wmma_i32_16x16x16_iu4 whose destination
; registers are the same as its C operand registers.
define amdgpu_kernel void @test_wmma_i32_16x16x16_iu4_kernel(
; GFX11-W32-LABEL: test_wmma_i32_16x16x16_iu4_kernel:
; GFX11-W32: ; %bb.0: ; %entry
; GFX11-W32-NEXT: s_load_b256 s[0:7], s[2:3], 0x0
; GFX11-W32-NEXT: v_mov_b32_e32 v12, 0
; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-W32-NEXT: s_load_b256 s[8:15], s[4:5], 0x0
; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-W32-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
; GFX11-W32-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
; GFX11-W32-NEXT: v_dual_mov_b32 v11, s3 :: v_dual_mov_b32 v10, s2
; GFX11-W32-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX11-W32-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
; GFX11-W32-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
; GFX11-W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-W32-NEXT: v_wmma_i32_16x16x16_iu4 v[0:7], v[8:9], v[10:11], v[0:7]
; GFX11-W32-NEXT: s_clause 0x1
; GFX11-W32-NEXT: global_store_b128 v12, v[4:7], s[6:7] offset:16
; GFX11-W32-NEXT: global_store_b128 v12, v[0:3], s[6:7]
; GFX11-W32-NEXT: s_nop 0
; GFX11-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-W32-NEXT: s_endpgm
ptr addrspace(1) %a_ptr,
ptr addrspace(1) %b_ptr,
ptr addrspace(1) %c_ptr,
ptr addrspace(1) %out) {
entry:
%a = load <2 x i32>, ptr addrspace(1) %a_ptr, align 8
%b = load <2 x i32>, ptr addrspace(1) %b_ptr, align 8
%c = load <8 x i32>, ptr addrspace(1) %c_ptr, align 32
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %a, i1 0, <2 x i32> %b, <8 x i32> %c, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out, align 32
ret void
}
Loading