-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AMDGPU] Use correct DWord for v_dot4 S0 operand #115224
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Change-Id: Ifc201f58eddd8f8994690bacbf34f446ccf2a790
|
@llvm/pr-subscribers-backend-amdgpu Author: Jeffrey Byrnes (jrbyrnes) Changes: Fixes a copy-paste typo. Patch is 42.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115224.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1a962e68c587c7..419414e5bd993d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14011,7 +14011,7 @@ static void placeSources(ByteProvider<SDValue> &Src0,
Src0s.push_back(
{*Src0.Src,
((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
- Src1.SrcOffset / 4});
+ Src0.SrcOffset / 4});
Src1s.push_back(
{*Src1.Src,
((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 108d85e024ad76..15734094db42cd 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -3450,4 +3450,850 @@ entry:
}
+define amdgpu_kernel void @ByteOffsetCorrectness(ptr addrspace(1) noalias readonly align 16 %inptr0, ptr addrspace(1) noalias readonly align 16 %inptr1, ptr addrspace(1) noalias align 16 %inptr2) local_unnamed_addr {
+; GFX7-LABEL: ByteOffsetCorrectness:
+; GFX7: ; %bb.0: ; %.entry
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX7-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7-NEXT: s_cbranch_execz .LBB17_5
+; GFX7-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v2
+; GFX7-NEXT: v_mul_hi_u32_u24_e32 v2, 0x48, v0
+; GFX7-NEXT: v_mul_u32_u24_e32 v1, 0x48, v0
+; GFX7-NEXT: s_movk_i32 s0, 0x900
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
+; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v3, s0, v[1:2]
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 5, v3
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, v6, v0
+; GFX7-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s11
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, s10, v4
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; GFX7-NEXT: v_mov_b32_e32 v6, s9
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, s8, v4
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: v_mov_b32_e32 v6, 0x48
+; GFX7-NEXT: s_movk_i32 s10, 0xffe1
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mov_b32_e32 v7, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX7-NEXT: s_mov_b32 s11, -1
+; GFX7-NEXT: s_mov_b64 s[12:13], 0
+; GFX7-NEXT: .LBB17_2: ; %.lr.ph
+; GFX7-NEXT: ; =>This Loop Header: Depth=1
+; GFX7-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX7-NEXT: v_mov_b32_e32 v8, 0
+; GFX7-NEXT: s_mov_b64 s[0:1], s[8:9]
+; GFX7-NEXT: .LBB17_3: ; %.preheader2
+; GFX7-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX7-NEXT: buffer_load_sbyte v9, v[4:5], s[0:3], 0 addr64
+; GFX7-NEXT: buffer_load_sbyte v10, v[4:5], s[0:3], 0 addr64 offset:1
+; GFX7-NEXT: buffer_load_sbyte v11, v[4:5], s[0:3], 0 addr64 offset:2
+; GFX7-NEXT: buffer_load_sbyte v12, v[4:5], s[0:3], 0 addr64 offset:3
+; GFX7-NEXT: buffer_load_sbyte v13, v[4:5], s[0:3], 0 addr64 offset:4
+; GFX7-NEXT: buffer_load_sbyte v14, v[4:5], s[0:3], 0 addr64 offset:5
+; GFX7-NEXT: buffer_load_sbyte v15, v[4:5], s[0:3], 0 addr64 offset:6
+; GFX7-NEXT: buffer_load_sbyte v16, v[4:5], s[0:3], 0 addr64 offset:7
+; GFX7-NEXT: buffer_load_sbyte v17, v[4:5], s[0:3], 0 addr64 offset:8
+; GFX7-NEXT: buffer_load_sbyte v18, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: buffer_load_sbyte v19, v[0:1], s[0:3], 0 addr64 offset:1
+; GFX7-NEXT: buffer_load_sbyte v20, v[0:1], s[0:3], 0 addr64 offset:2
+; GFX7-NEXT: buffer_load_sbyte v21, v[0:1], s[0:3], 0 addr64 offset:3
+; GFX7-NEXT: buffer_load_sbyte v22, v[0:1], s[0:3], 0 addr64 offset:4
+; GFX7-NEXT: buffer_load_sbyte v23, v[0:1], s[0:3], 0 addr64 offset:5
+; GFX7-NEXT: buffer_load_sbyte v24, v[0:1], s[0:3], 0 addr64 offset:6
+; GFX7-NEXT: buffer_load_sbyte v25, v[0:1], s[0:3], 0 addr64 offset:7
+; GFX7-NEXT: buffer_load_sbyte v26, v[0:1], s[0:3], 0 addr64 offset:8
+; GFX7-NEXT: s_add_u32 s0, s0, 9
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, s[0:1], v[6:7]
+; GFX7-NEXT: s_and_b64 vcc, exec, vcc
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_mad_i32_i24 v8, v18, v9, v8
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_mad_i32_i24 v8, v19, v10, v8
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_mad_i32_i24 v8, v20, v11, v8
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_mad_i32_i24 v8, v21, v12, v8
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_mad_i32_i24 v8, v22, v13, v8
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_mad_i32_i24 v8, v23, v14, v8
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mad_i32_i24 v8, v24, v15, v8
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mad_i32_i24 v8, v25, v16, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mad_i32_i24 v8, v26, v17, v8
+; GFX7-NEXT: s_cbranch_vccnz .LBB17_3
+; GFX7-NEXT: ; %bb.4: ; %.110
+; GFX7-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX7-NEXT: v_lshl_b64 v[9:10], v[2:3], 2
+; GFX7-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], v[2:3]
+; GFX7-NEXT: buffer_store_dword v8, v[9:10], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v2
+; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x900, v4
+; GFX7-NEXT: v_mov_b32_e32 v2, v8
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX7-NEXT: s_or_b64 s[12:13], s[0:1], s[12:13]
+; GFX7-NEXT: v_mov_b32_e32 v3, v9
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[12:13]
+; GFX7-NEXT: s_cbranch_execnz .LBB17_2
+; GFX7-NEXT: .LBB17_5: ; %._crit_edge
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: ByteOffsetCorrectness:
+; GFX8: ; %bb.0: ; %.entry
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_cbranch_execz .LBB17_5
+; GFX8-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v3, v2
+; GFX8-NEXT: s_movk_i32 s0, 0x900
+; GFX8-NEXT: v_mul_hi_u32_u24_e32 v4, 0x900, v3
+; GFX8-NEXT: v_mul_u32_u24_e32 v3, 0x900, v3
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v2, s0, v[3:4]
+; GFX8-NEXT: s_movk_i32 s0, 0x48
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s0, v[1:2]
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 5, v5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
+; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], 0, 0, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s6, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v4, v3, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v2
+; GFX8-NEXT: s_movk_i32 s4, 0xffe1
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v4, v3, vcc
+; GFX8-NEXT: s_mov_b32 s5, -1
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: .LBB17_2: ; %.lr.ph
+; GFX8-NEXT: ; =>This Loop Header: Depth=1
+; GFX8-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX8-NEXT: v_mov_b32_e32 v10, 0
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB17_3: ; %.preheader2
+; GFX8-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v8
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v9, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v6
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v7, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v11, v[4:5]
+; GFX8-NEXT: flat_load_sbyte v12, v[2:3]
+; GFX8-NEXT: s_add_u32 s0, s0, 9
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0x48
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v12, v11, v10
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 2, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 2, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 3, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 3, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 4, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 4, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 5, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 5, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v12, v10, v13, v12
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 7, v4
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_sbyte v13, v[10:11]
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 7, v2
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 8, v4
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v2
+; GFX8-NEXT: flat_load_sbyte v10, v[10:11]
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_sbyte v4, v[4:5]
+; GFX8-NEXT: flat_load_sbyte v2, v[2:3]
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_mad_i32_i24 v10, v10, v13, v12
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_i32_i24 v10, v2, v4, v10
+; GFX8-NEXT: s_cbranch_scc1 .LBB17_3
+; GFX8-NEXT: ; %bb.4: ; %.110
+; GFX8-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; GFX8-NEXT: flat_store_dword v[2:3], v10
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x900, v6
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x900, v8
+; GFX8-NEXT: v_mov_b32_e32 v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GFX8-NEXT: s_or_b64 s[6:7], s[0:1], s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v1, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB17_2
+; GFX8-NEXT: .LBB17_5: ; %._crit_edge
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-NODL-LABEL: ByteOffsetCorrectness:
+; GFX9-NODL: ; %bb.0: ; %.entry
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NODL-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX9-NODL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NODL-NEXT: s_cbranch_execz .LBB17_5
+; GFX9-NODL-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX9-NODL-NEXT: v_add_u32_e32 v10, v3, v2
+; GFX9-NODL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
+; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v1, 5, v10
+; GFX9-NODL-NEXT: s_movk_i32 s3, 0x900
+; GFX9-NODL-NEXT: v_mul_hi_u32_u24_e32 v9, 0x900, v2
+; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v8, 0x900, v2
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v4, vcc, v1, v0
+; GFX9-NODL-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9]
+; GFX9-NODL-NEXT: v_mul_hi_u32_u24_e32 v7, 0x48, v0
+; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v6, 0x48, v0
+; GFX9-NODL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7]
+; GFX9-NODL-NEXT: s_movk_i32 s2, 0x48
+; GFX9-NODL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2]
+; GFX9-NODL-NEXT: v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc
+; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: v_mov_b32_e32 v8, s9
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v0, vcc, s8, v6
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v7, vcc
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX9-NODL-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v6, vcc, s10, v6
+; GFX9-NODL-NEXT: s_movk_i32 s6, 0xffe1
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v7, vcc
+; GFX9-NODL-NEXT: s_mov_b32 s7, -1
+; GFX9-NODL-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NODL-NEXT: .LBB17_2: ; %.lr.ph
+; GFX9-NODL-NEXT: ; =>This Loop Header: Depth=1
+; GFX9-NODL-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-NODL-NEXT: s_mov_b64 s[10:11], 0
+; GFX9-NODL-NEXT: .LBB17_3: ; %.preheader2
+; GFX9-NODL-NEXT: ; Parent Loop BB17_2 Depth=1
+; GFX9-NODL-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v12, s11
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v9, vcc, s10, v6
+; GFX9-NODL-NEXT: v_add_co_u32_e64 v11, s[0:1], s10, v0
+; GFX9-NODL-NEXT: v_add_co_u32_e64 v13, s[2:3], s10, v2
+; GFX9-NODL-NEXT: v_addc_co_u32_e64 v14, s[2:3], v3, v12, s[2:3]
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v10, vcc, v7, v12, vcc
+; GFX9-NODL-NEXT: v_addc_co_u32_e64 v12, vcc, v1, v12, s[0:1]
+; GFX9-NODL-NEXT: global_load_sbyte v15, v[13:14], off
+; GFX9-NODL-NEXT: global_load_sbyte v16, v[11:12], off offset:1
+; GFX9-NODL-NEXT: global_load_sbyte v17, v[11:12], off offset:2
+; GFX9-NODL-NEXT: global_load_sbyte v18, v[11:12], off offset:3
+; GFX9-NODL-NEXT: global_load_sbyte v19, v[11:12], off offset:4
+; GFX9-NODL-NEXT: global_load_sbyte v20, v[11:12], off offset:5
+; GFX9-NODL-NEXT: global_load_sbyte v21, v[11:12], off offset:6
+; GFX9-NODL-NEXT: global_load_sbyte v22, v[11:12], off offset:7
+; GFX9-NODL-NEXT: global_load_sbyte v23, v[9:10], off
+; GFX9-NODL-NEXT: global_load_sbyte v24, v[9:10], off offset:1
+; GFX9-NODL-NEXT: global_load_sbyte v25, v[9:10], off offset:2
+; GFX9-NODL-NEXT: global_load_sbyte v26, v[9:10], off offset:3
+; GFX9-NODL-NEXT: global_load_sbyte v27, v[9:10], off offset:4
+; GFX9-NODL-NEXT: global_load_sbyte v28, v[9:10], off offset:5
+; GFX9-NODL-NEXT: global_load_sbyte v29, v[9:10], off offset:6
+; GFX9-NODL-NEXT: ; kill: killed $vgpr11 killed $vgpr12
+; GFX9-NODL-NEXT: global_load_sbyte v11, v[9:10], off offset:7
+; GFX9-NODL-NEXT: global_load_sbyte v12, v[13:14], off offset:8
+; GFX9-NODL-NEXT: global_load_sbyte v30, v[9:10], off offset:8
+; GFX9-NODL-NEXT: s_add_u32 s10, s10, 9
+; GFX9-NODL-NEXT: s_addc_u32 s11, s11, 0
+; GFX9-NODL-NEXT: s_cmp_lg_u64 s[10:11], 0x48
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v23, v15, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v24, v16, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v25, v17, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v26, v18, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v27, v19, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v28, v20, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v29, v21, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v11, v22, v8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT: v_mad_i32_i24 v8, v30, v12, v8
+; GFX9-NODL-NEXT: s_cbranch_scc1 .LBB17_3
+; GFX9-NODL-NEXT: ; %bb.4: ; %.110
+; GFX9-NODL-NEXT: ; in Loop: Header=BB17_2 Depth=1
+; GFX9-NODL-NEXT: v_lshlrev_b64 v[9:10], 2, v[4:5]
+; GFX9-NODL-NEXT: v_mov_b32_e32 v11, s5
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v9, vcc, s4, v9
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v10, vcc, v11, v10, vcc
+; GFX9-NODL-NEXT: global_store_dword v[9:10], v8, off
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v8, vcc, 32, v4
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v0, vcc, 0x900, v0
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v2, vcc, 0x900, v2
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NODL-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[4:5]
+; GFX9-NODL-NEXT: v_add_co_u32_e32 v6, vcc, 0x900, v6
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, v8
+; GFX9-NODL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-NODL-NEXT: s_or_b64 s[8:9], s[0:1], s[8:9]
+; GFX9-NODL-NEXT: v_mov_b32_e32 v5, v9
+; GFX9-NODL-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX9-NODL-NEXT: s_cbranch_execnz .LBB17_2
+; GFX9-NODL-NEXT: .LBB17_5: ; %._crit_edge
+; GFX9-NODL-NEXT: s_endpgm
+;
+; GFX9-DL-LABEL: ByteOffsetCorrectness:
+; GFX9-DL: ; %bb.0: ; %.entry
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DL-NEXT: v_cmp_gt_i64_e32 vcc, 2, v[0:1]
+; GFX9-DL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DL-NEXT: s_cbranch_execz .LBB17_5
+; GFX9-DL-NEXT: ; %bb.1: ; %.lr.ph.preheader
+; GFX9-DL-NEXT: v_add_u32_e32 v10, v3, v2
+; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x24
+; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
+; GFX9-DL-NEXT: v_lshlrev_b32_e32 v1, 5, v10
+; GFX9-DL-NEXT: s_movk_i32 s3, 0x900
+; GFX9-DL-NEXT: v_mul_hi_u32_u24_e32 v9, 0x900, v2
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, 0x900, v2
+; GFX9-DL-NEXT: v_add_co_u32_e32 v4, vcc, v1, v0
+; GFX9-DL-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v3, s3, v[8:9]
+; GFX9-DL-NEXT: v_mul_hi_u32_u24_e32 v7, 0x48, v0
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, 0x48, v0
+; GFX9-DL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, s3, v[6:7]
+; GFX9-DL-NEXT: s_movk_i32 s2, 0x48
+; GFX9-DL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, s2, v[1:2]
+; GFX9-DL-NEXT: v_addc_co_u32_e64 v5, s[0:1], 0, 0, vcc
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9
+; GFX9-DL-NEXT: v_add_co_u32_e32 v0, vcc, s8, v6
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v7, vcc
+; GFX9-DL-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT: v_add_co_u32_e32 v6, vcc, s10, v6
+; GFX9-DL-NEXT: s_movk_i32 s8, 0xffe1
+; GFX9-DL-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v7, vcc
+; GFX9-DL-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-DL-NEXT: s_mov_b32 s12, 0xc0c0400
+; GFX9-DL-NEXT: s_mov_b32 s9, -1
+; GFX9-DL-NEXT: s_mov_b32 s13, 0x4000c0c
+; GFX9-DL-NEXT: .LBB17_2: ; %.lr.ph
+; GFX9-DL-NEXT: ; =>This Loop Header: Depth=1
+; GFX9-DL-NEXT: ; Child Loop BB17_3 Depth 2
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-DL-NEXT: s_mov_b64 s[10:11], 0
+; GFX9-DL-NEXT: .L...
[truncated]
|
llvm/test/CodeGen/AMDGPU/idot4s.ll
Outdated
| } | ||
|
|
||
|
|
||
| define amdgpu_kernel void @ByteOffsetCorrectness(ptr addrspace(1) noalias readonly align 16 %inptr0, ptr addrspace(1) noalias readonly align 16 %inptr1, ptr addrspace(1) noalias align 16 %inptr2) local_unnamed_addr { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comment the test. Can this be shrunk?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The behavior is actually a bit difficult to reproduce with a small concise example -- optimizations / DAG building consistently transform the code in a way which hides the issue.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But what transformations? You can usually massage the test to defeat them. e.g. use volatile loads.
If you avoid using the work item ID intrinsics, you can avoid more transforms (like you're bypassing the annotation that will form assert-zexts)
Change-Id: I70cc33b3e3af22d276ede907d3cbf9a2132f6ce4
Change-Id: I670d272205b5431a1fc434abd94550747c49c15e
Change-Id: If26584f3e25c5a1e4ec33ca71ac1d331eae24103
Co-authored-by: Matt Arsenault <[email protected]>
Change-Id: I12db92842348a45b8c6d6d6914120427ae1b62a6
|
Should add the fixes issue number to the description |
|
Description should also be more clear this is a bug fix |
Change-Id: I243db030171d8997d227378dd3bbc9147aa56bdb
Fixes a copy-paste typo.
The typo resulted in producing bad v_perm based operands for the v_dot4 combine. When adding a corresponding byte pair to the v_dot byte pair chains, we must take note of the byte position in the corresponding source nodes. These byte positions are used to ensure we extract the correct DWord from the ultimate source, and formulate a correct perm_mask from the extracted DWord.
With the typo, the S0 byte would use the DWord offset of the corresponding S1 byte. If this offset was not the same as the true DWord offset for the S0 byte, we would extract and use the wrong byte for S0 in the v_dot.
Fixes #112941