-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Fix GFX11 WMMA intrinsic lowering regression for compute kernels #164036
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -1452,6 +1452,66 @@ let WaveSizePredicate = isWave64 in { | |||||
|
|
||||||
| } | ||||||
|
|
||||||
| // GFX11 RDNA3 WMMA patterns for bare intrinsic calls (no explicit modifiers) | ||||||
| // Match intrinsics directly and provide zero modifiers to the instruction | ||||||
| // High AddedComplexity ensures these beat the broken WMMARegularPat patterns | ||||||
|
|
||||||
| // Wave32 patterns (RDNA3 native wave size) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| let SubtargetPredicate = isGFX11Only, WaveSizePredicate = isWave32 in { | ||||||
|
|
||||||
| // FP16 WMMA: <8 x float> = wmma(<16 x half>, <16 x half>, <8 x float>) | ||||||
| def : GCNPat < | ||||||
| (v8f32 (int_amdgcn_wmma_f32_16x16x16_f16 v16f16:$a, v16f16:$b, v8f32:$c)), | ||||||
| (v8f32 (V_WMMA_F32_16X16X16_F16_twoaddr_w32 (i32 0), v16f16:$a, (i32 0), v16f16:$b, (i32 0), v8f32:$c)) | ||||||
| > { | ||||||
| let AddedComplexity = 10000; | ||||||
| } | ||||||
|
|
||||||
| // BF16 WMMA: <8 x float> = wmma(<16 x i16>, <16 x i16>, <8 x float>) | ||||||
| def : GCNPat < | ||||||
| (v8f32 (int_amdgcn_wmma_f32_16x16x16_bf16 v16i16:$a, v16i16:$b, v8f32:$c)), | ||||||
| (v8f32 (V_WMMA_F32_16X16X16_BF16_twoaddr_w32 (i32 0), v16i16:$a, (i32 0), v16i16:$b, (i32 0), v8f32:$c)) | ||||||
| > { | ||||||
| let AddedComplexity = 10000; | ||||||
| } | ||||||
|
|
||||||
| // INT8 WMMA: <8 x i32> = wmma(i1, <4 x i32>, i1, <4 x i32>, <8 x i32>, i1) | ||||||
| def : GCNPat < | ||||||
| (v8i32 (int_amdgcn_wmma_i32_16x16x16_iu8 i1:$a_neg, v4i32:$a, i1:$b_neg, v4i32:$b, v8i32:$c, i1:$clamp)), | ||||||
| (v8i32 (V_WMMA_I32_16X16X16_IU8_twoaddr_w32 (VOP3PModsNeg $a_neg), v4i32:$a, (VOP3PModsNeg $b_neg), v4i32:$b, (i32 8), v8i32:$c, i1:$clamp)) | ||||||
| > { | ||||||
| let AddedComplexity = 10000; | ||||||
| } | ||||||
|
|
||||||
| // INT4 WMMA: <8 x i32> = wmma(i1, <2 x i32>, i1, <2 x i32>, <8 x i32>, i1) | ||||||
| def : GCNPat < | ||||||
| (v8i32 (int_amdgcn_wmma_i32_16x16x16_iu4 i1:$a_neg, v2i32:$a, i1:$b_neg, v2i32:$b, v8i32:$c, i1:$clamp)), | ||||||
| (v8i32 (V_WMMA_I32_16X16X16_IU4_twoaddr_w32 (VOP3PModsNeg $a_neg), v2i32:$a, (VOP3PModsNeg $b_neg), v2i32:$b, (i32 8), v8i32:$c, i1:$clamp)) | ||||||
| > { | ||||||
| let AddedComplexity = 10000; | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // Wave64 patterns (compatibility mode) | ||||||
| let SubtargetPredicate = isGFX11Only, WaveSizePredicate = isWave64 in { | ||||||
|
|
||||||
| // FP16 WMMA Wave64: <4 x float> = wmma(<16 x half>, <16 x half>, <4 x float>) | ||||||
| def : GCNPat < | ||||||
| (v4f32 (int_amdgcn_wmma_f32_16x16x16_f16 v16f16:$a, v16f16:$b, v4f32:$c)), | ||||||
| (v4f32 (V_WMMA_F32_16X16X16_F16_twoaddr_w64 (i32 0), v16f16:$a, (i32 0), v16f16:$b, (i32 0), v4f32:$c)) | ||||||
| > { | ||||||
| let AddedComplexity = 10000; | ||||||
| } | ||||||
|
|
||||||
| // BF16 WMMA Wave64: <4 x float> = wmma(<16 x i16>, <16 x i16>, <4 x float>) | ||||||
| def : GCNPat < | ||||||
| (v4f32 (int_amdgcn_wmma_f32_16x16x16_bf16 v16i16:$a, v16i16:$b, v4f32:$c)), | ||||||
| (v4f32 (V_WMMA_F32_16X16X16_BF16_twoaddr_w64 (i32 0), v16i16:$a, (i32 0), v16i16:$b, (i32 0), v4f32:$c)) | ||||||
| > { | ||||||
| let AddedComplexity = 10000; | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, | ||||||
| bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0, | ||||||
| bit _HasMatrixFMT = 0, bit _HasMatrixScale = 0, | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,172 @@ | ||||||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||||||
| ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX11-W32 | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
|
||||||
| ; Test GFX11 WMMA with amdgpu_kernel (compute) calling convention | ||||||
| ; This test is critical to prevent regression of compute kernel WMMA support | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't need this line. Most tests are critical to prevent regression of whatever they're testing. |
||||||
|
|
||||||
| declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <8 x float>) | ||||||
| declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x float>) | ||||||
| declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1, <4 x i32>, i1, <4 x i32>, <8 x i32>, i1) | ||||||
| declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1, <2 x i32>, i1, <2 x i32>, <8 x i32>, i1) | ||||||
|
Comment on lines
+7
to
+10
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You don't need to declare intrinsics (although some people still like to for reasons I don't understand). |
||||||
|
|
||||||
| define amdgpu_kernel void @test_wmma_f32_16x16x16_f16_kernel( | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Does the kernel part really matter, or is it !graphics? Can you also test this just in a regular CCC function? |
||||||
| ; GFX11-W32-LABEL: test_wmma_f32_16x16x16_f16_kernel: | ||||||
| ; GFX11-W32: ; %bb.0: ; %entry | ||||||
| ; GFX11-W32-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 | ||||||
| ; GFX11-W32-NEXT: v_mov_b32_e32 v24, 0 | ||||||
| ; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) | ||||||
| ; GFX11-W32-NEXT: s_load_b256 s[8:15], s[0:1], 0x0 | ||||||
| ; GFX11-W32-NEXT: s_load_b256 s[16:23], s[2:3], 0x0 | ||||||
| ; GFX11-W32-NEXT: s_load_b256 s[24:31], s[4:5], 0x0 | ||||||
| ; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v17, s25 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v13, s21 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v15, s23 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v18, s26 :: v_dual_mov_b32 v19, s27 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v20, s28 :: v_dual_mov_b32 v21, s29 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v22, s30 :: v_dual_mov_b32 v23, s31 | ||||||
| ; GFX11-W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | ||||||
| ; GFX11-W32-NEXT: v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] | ||||||
| ; GFX11-W32-NEXT: s_clause 0x1 | ||||||
| ; GFX11-W32-NEXT: global_store_b128 v24, v[20:23], s[6:7] offset:16 | ||||||
| ; GFX11-W32-NEXT: global_store_b128 v24, v[16:19], s[6:7] | ||||||
| ; GFX11-W32-NEXT: s_nop 0 | ||||||
| ; GFX11-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||||||
| ; GFX11-W32-NEXT: s_endpgm | ||||||
| ptr addrspace(1) %a_ptr, | ||||||
| ptr addrspace(1) %b_ptr, | ||||||
| ptr addrspace(1) %c_ptr, | ||||||
| ptr addrspace(1) %out) { | ||||||
| entry: | ||||||
| %a = load <16 x half>, ptr addrspace(1) %a_ptr, align 32 | ||||||
| %b = load <16 x half>, ptr addrspace(1) %b_ptr, align 32 | ||||||
| %c = load <8 x float>, ptr addrspace(1) %c_ptr, align 32 | ||||||
| %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %a, <16 x half> %b, <8 x float> %c) | ||||||
| store <8 x float> %res, ptr addrspace(1) %out, align 32 | ||||||
| ret void | ||||||
| } | ||||||
|
|
||||||
| define amdgpu_kernel void @test_wmma_f32_16x16x16_bf16_kernel( | ||||||
| ; GFX11-W32-LABEL: test_wmma_f32_16x16x16_bf16_kernel: | ||||||
| ; GFX11-W32: ; %bb.0: ; %entry | ||||||
| ; GFX11-W32-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 | ||||||
| ; GFX11-W32-NEXT: v_mov_b32_e32 v24, 0 | ||||||
| ; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) | ||||||
| ; GFX11-W32-NEXT: s_load_b256 s[8:15], s[0:1], 0x0 | ||||||
| ; GFX11-W32-NEXT: s_load_b256 s[16:23], s[2:3], 0x0 | ||||||
| ; GFX11-W32-NEXT: s_load_b256 s[24:31], s[4:5], 0x0 | ||||||
| ; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v17, s25 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v12, s20 :: v_dual_mov_b32 v13, s21 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v15, s23 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v18, s26 :: v_dual_mov_b32 v19, s27 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v20, s28 :: v_dual_mov_b32 v21, s29 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v22, s30 :: v_dual_mov_b32 v23, s31 | ||||||
| ; GFX11-W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | ||||||
| ; GFX11-W32-NEXT: v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] | ||||||
| ; GFX11-W32-NEXT: s_clause 0x1 | ||||||
| ; GFX11-W32-NEXT: global_store_b128 v24, v[20:23], s[6:7] offset:16 | ||||||
| ; GFX11-W32-NEXT: global_store_b128 v24, v[16:19], s[6:7] | ||||||
| ; GFX11-W32-NEXT: s_nop 0 | ||||||
| ; GFX11-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||||||
| ; GFX11-W32-NEXT: s_endpgm | ||||||
| ptr addrspace(1) %a_ptr, | ||||||
| ptr addrspace(1) %b_ptr, | ||||||
| ptr addrspace(1) %c_ptr, | ||||||
| ptr addrspace(1) %out) { | ||||||
| entry: | ||||||
| %a = load <16 x i16>, ptr addrspace(1) %a_ptr, align 32 | ||||||
| %b = load <16 x i16>, ptr addrspace(1) %b_ptr, align 32 | ||||||
| %c = load <8 x float>, ptr addrspace(1) %c_ptr, align 32 | ||||||
| %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %a, <16 x i16> %b, <8 x float> %c) | ||||||
| store <8 x float> %res, ptr addrspace(1) %out, align 32 | ||||||
| ret void | ||||||
| } | ||||||
|
|
||||||
| define amdgpu_kernel void @test_wmma_i32_16x16x16_iu8_kernel( | ||||||
| ; GFX11-W32-LABEL: test_wmma_i32_16x16x16_iu8_kernel: | ||||||
| ; GFX11-W32: ; %bb.0: ; %entry | ||||||
| ; GFX11-W32-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 | ||||||
| ; GFX11-W32-NEXT: v_mov_b32_e32 v16, 0 | ||||||
| ; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) | ||||||
| ; GFX11-W32-NEXT: s_load_b128 s[16:19], s[0:1], 0x0 | ||||||
| ; GFX11-W32-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 | ||||||
| ; GFX11-W32-NEXT: s_load_b256 s[8:15], s[4:5], 0x0 | ||||||
| ; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v15, s3 :: v_dual_mov_b32 v14, s2 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v13, s1 :: v_dual_mov_b32 v12, s0 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 | ||||||
| ; GFX11-W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | ||||||
| ; GFX11-W32-NEXT: v_wmma_i32_16x16x16_iu8 v[0:7], v[8:11], v[12:15], v[0:7] | ||||||
| ; GFX11-W32-NEXT: s_clause 0x1 | ||||||
| ; GFX11-W32-NEXT: global_store_b128 v16, v[4:7], s[6:7] offset:16 | ||||||
| ; GFX11-W32-NEXT: global_store_b128 v16, v[0:3], s[6:7] | ||||||
| ; GFX11-W32-NEXT: s_nop 0 | ||||||
| ; GFX11-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||||||
| ; GFX11-W32-NEXT: s_endpgm | ||||||
| ptr addrspace(1) %a_ptr, | ||||||
| ptr addrspace(1) %b_ptr, | ||||||
| ptr addrspace(1) %c_ptr, | ||||||
| ptr addrspace(1) %out) { | ||||||
| entry: | ||||||
| %a = load <4 x i32>, ptr addrspace(1) %a_ptr, align 16 | ||||||
| %b = load <4 x i32>, ptr addrspace(1) %b_ptr, align 16 | ||||||
| %c = load <8 x i32>, ptr addrspace(1) %c_ptr, align 32 | ||||||
| %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %a, i1 0, <4 x i32> %b, <8 x i32> %c, i1 0) | ||||||
| store <8 x i32> %res, ptr addrspace(1) %out, align 32 | ||||||
| ret void | ||||||
| } | ||||||
|
|
||||||
| define amdgpu_kernel void @test_wmma_i32_16x16x16_iu4_kernel( | ||||||
| ; GFX11-W32-LABEL: test_wmma_i32_16x16x16_iu4_kernel: | ||||||
| ; GFX11-W32: ; %bb.0: ; %entry | ||||||
| ; GFX11-W32-NEXT: s_load_b256 s[0:7], s[2:3], 0x0 | ||||||
| ; GFX11-W32-NEXT: v_mov_b32_e32 v12, 0 | ||||||
| ; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) | ||||||
| ; GFX11-W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 | ||||||
| ; GFX11-W32-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 | ||||||
| ; GFX11-W32-NEXT: s_load_b256 s[8:15], s[4:5], 0x0 | ||||||
| ; GFX11-W32-NEXT: s_waitcnt lgkmcnt(0) | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v11, s3 :: v_dual_mov_b32 v10, s2 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 | ||||||
| ; GFX11-W32-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 | ||||||
| ; GFX11-W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | ||||||
| ; GFX11-W32-NEXT: v_wmma_i32_16x16x16_iu4 v[0:7], v[8:9], v[10:11], v[0:7] | ||||||
| ; GFX11-W32-NEXT: s_clause 0x1 | ||||||
| ; GFX11-W32-NEXT: global_store_b128 v12, v[4:7], s[6:7] offset:16 | ||||||
| ; GFX11-W32-NEXT: global_store_b128 v12, v[0:3], s[6:7] | ||||||
| ; GFX11-W32-NEXT: s_nop 0 | ||||||
| ; GFX11-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||||||
| ; GFX11-W32-NEXT: s_endpgm | ||||||
| ptr addrspace(1) %a_ptr, | ||||||
| ptr addrspace(1) %b_ptr, | ||||||
| ptr addrspace(1) %c_ptr, | ||||||
| ptr addrspace(1) %out) { | ||||||
| entry: | ||||||
| %a = load <2 x i32>, ptr addrspace(1) %a_ptr, align 8 | ||||||
| %b = load <2 x i32>, ptr addrspace(1) %b_ptr, align 8 | ||||||
| %c = load <8 x i32>, ptr addrspace(1) %c_ptr, align 32 | ||||||
| %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %a, i1 0, <2 x i32> %b, <8 x i32> %c, i1 0) | ||||||
| store <8 x i32> %res, ptr addrspace(1) %out, align 32 | ||||||
| ret void | ||||||
| } | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The pattern can't be broken; we should not be avoiding broken patterns with AddedComplexity.