Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2926,6 +2926,14 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
return RADD;

// (X + Y) + X --> Y + (X + X)
SDValue X, Y;
if (sd_match(N, m_AddLike(m_OneUse(m_AddLike(m_Value(X), m_Value(Y))),
m_Deferred(X))))
if (X != Y)
return DAG.getNode(ISD::ADD, DL, VT, Y,
DAG.getNode(ISD::ADD, DL, VT, X, X));

// Reassociate (add (or x, c), y) -> (add (add x, y), c) if (or x, c) is
// equivalent to (add x, c).
// Reassociate (add (xor x, c), y) -> (add (add x, y), c) if (xor x, c) is
Expand Down
5 changes: 1 addition & 4 deletions llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@
define i64 @g(ptr %p) {
; CHECK-LABEL: g:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr x8, [x0, #8]
; CHECK-NEXT: add x9, x8, x8
; CHECK-NEXT: add x8, x9, x8
; CHECK-NEXT: sub x0, x8, x8
; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: ret
%vec = load <2 x i64>, ptr %p, align 1
%elt = extractelement <2 x i64> %vec, i32 1
Expand Down
122 changes: 71 additions & 51 deletions llvm/test/CodeGen/AMDGPU/idot2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1717,11 +1717,11 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
Expand All @@ -1748,8 +1748,8 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
Expand All @@ -1765,13 +1765,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v1
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
Expand All @@ -1785,13 +1788,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v2
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v1
; GFX9-DL-NEXT: v_mad_u32_u24 v1, v4, v3, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
Expand All @@ -1811,12 +1817,14 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v2, v1, v0
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jayfoad @arsenm It looks like we're missing demanded bits handling for MAD24 instructions - but I haven't found much in the DAG that handles the MAD24 opcodes at all - is this all currently done with isel patterns? Is it going to cause problems if I try to add MAD24 DAG lowering?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have the code in front of me but I think it's mostly done in IR in AMDGPUCodeGenPrepare. We'd like to form mul24 when only the low 24 bits of a regular mul are demanded, but I don't think there's an easy way to implement a target-specific demanded bits optimization for a generic node like MUL.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we handle it in both AMDGPUCodeGenPrepare and a DAG combine. We appear to be missing SimplifyDemandedBitsForTargetNode.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can find plenty in CGP/DAG for MUL24 but not much for MAD24.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I think the mad24 case is just a .td pattern.

; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
Expand Down Expand Up @@ -1873,8 +1881,8 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v1, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
Expand All @@ -1901,8 +1909,8 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
Expand All @@ -1918,13 +1926,16 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 16
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 16
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v1
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v4, v3, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
Expand All @@ -1938,13 +1949,16 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 16
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 16
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v1
; GFX9-DL-NEXT: v_mad_i32_i24 v1, v4, v3, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
Expand All @@ -1964,12 +1978,14 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v0
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v2, v1, v0
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
Expand Down Expand Up @@ -2341,10 +2357,11 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_mul_u32_u24_e32 v4, v3, v1
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
Expand All @@ -2370,9 +2387,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s0
; GFX8-NEXT: v_mul_u32_u24_e32 v4, v0, v3
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -2394,9 +2412,9 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, v4
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT: v_add3_u32 v1, s0, v1, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
Expand All @@ -2415,9 +2433,9 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, v4
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT: v_add3_u32 v1, s0, v1, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
Expand All @@ -2438,12 +2456,12 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v3, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, v4
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
Expand Down Expand Up @@ -2499,9 +2517,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s0
; GFX7-NEXT: v_mul_i32_i24_e32 v4, v0, v2
; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
Expand All @@ -2527,9 +2546,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s0
; GFX8-NEXT: v_mul_i32_i24_e32 v4, v0, v3
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -2551,9 +2571,9 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, v4
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT: v_add3_u32 v1, s0, v1, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
Expand All @@ -2572,9 +2592,9 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, v4
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT: v_add3_u32 v1, s0, v1, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
Expand All @@ -2595,12 +2615,12 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_mul_i32_i24_e32 v4, v3, v0
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, v4
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/Hexagon/isel-fold-shl-zext.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,10 @@ define dso_local void @foo(i64* nocapture noundef %buf, i32 %a, i32 %b) local_un
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: {
; CHECK-NEXT: r2 = addasl(r2,r1,#1)
; CHECK-NEXT: r3 = asl(r1,#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = addasl(r2,r1,#1)
; CHECK-NEXT: r2 += add(r3,r3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/avx-vinsertf128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,13 @@ define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
; CHECK-LABEL: DAGCombineB:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT: retq
%t1 = add <8 x i32> %v1, %v2
%t2 = add <8 x i32> %t1, %v1
Expand Down