Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2520,6 +2520,8 @@ def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
AssemblerPredicate<(all_of FeatureAshrPkInsts)>;

def HasLShlAddB64 : Predicate<"Subtarget->hasLshlAddB64()">;

// Include AMDGPU TD files
include "SISchedule.td"
include "GCNProcessors.td"
Expand Down
13 changes: 12 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
Expand Down Expand Up @@ -736,13 +737,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32);
} else {
getActionDefinitionsBuilder({G_ADD, G_SUB})
getActionDefinitionsBuilder(G_SUB)
.legalFor({S32, S16, V2S16})
.clampMaxNumElementsStrict(0, S16, 2)
.scalarize(0)
.minScalar(0, S16)
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32);

getActionDefinitionsBuilder(G_ADD)
.legalFor(ST.hasLshlAddB64()
? std::initializer_list<LLT>{S64, S32, S16, V2S16}
: std::initializer_list<LLT>{S32, S16, V2S16})
.clampMaxNumElementsStrict(0, S16, 2)
.scalarize(0)
.minScalar(0, S16)
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32);
}

if (ST.hasScalarSMulU64()) {
Expand Down
18 changes: 15 additions & 3 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -762,15 +762,27 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;

let SubtargetPredicate = HasLShlAddB64 in {
// TODO: Canonicalize these in the target specific CombinerHelper?
def : GCNPat<
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
(ptradd (shl i64:$src0, i32:$shift), i64:$src1),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does ptradd just get ignored by the DAG? Should this maintain the DivergentBinFrag predicate?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the diff is misaligned by git. Only two new patterns are added in this PR:

def : GCNPat<
  (ptradd (shl i64:$src0, i32:$shift), i64:$src1),
  (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$shift, VSrc_b64:$src1)
>;

def : GCNPat<
  (ptradd i64:$src0, (shl i64:$src1, i32:$shift)),
  (V_LSHL_ADD_U64_e64 VSrc_b64:$src1, VSrc_b32:$shift, VSrc_b64:$src0)
>;

(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$shift, VSrc_b64:$src1)
>;

def : GCNPat<
(ptradd i64:$src0, (shl i64:$src1, i32:$shift)),
(V_LSHL_ADD_U64_e64 VSrc_b64:$src1, VSrc_b32:$shift, VSrc_b64:$src0)
>;

let SubtargetPredicate = isGFX940Plus in
def : GCNPat<
(ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
>;
} // End SubtargetPredicate = HasLShlAddB64

def : GCNPat<
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This case doesn't look covered in the test?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is simply a git diff misalignment: lines 783-785 are the original lines 765-767, moved.

(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;

def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>;
Expand Down
228 changes: 189 additions & 39 deletions llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
Original file line number Diff line number Diff line change
@@ -1,108 +1,258 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefix=GI %s

define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_v1v:
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1, v[{{[0-9:]+}}]
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GI-LABEL: lshl_add_u64_v1v:
; GI: ; %bb.0:
; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3]
; GI-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 1
%add = add i64 %shl, %a
ret i64 %add
}

define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_v4v:
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4, v[{{[0-9:]+}}]
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 4, v[2:3]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GI-LABEL: lshl_add_u64_v4v:
; GI: ; %bb.0:
; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 4, v[2:3]
; GI-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 4
%add = add i64 %shl, %a
ret i64 %add
}

define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_v5v:
; GCN: v_lshlrev_b64
; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 5, v[2:3]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GI-LABEL: lshl_add_u64_v5v:
; GI: ; %bb.0:
; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GI-NEXT: v_lshlrev_b64 v[0:1], 5, v[0:1]
; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GI-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 5
%add = add i64 %shl, %a
ret i64 %add
}

define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
; GCN-LABEL: lshl_add_u64_vvv:
; GCN: v_lshlrev_b64
; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], v2, v[4:5]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GI-LABEL: lshl_add_u64_vvv:
; GI: ; %bb.0:
; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
; GI-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, %s
%add = add i64 %shl, %a
ret i64 %add
}

define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
define i64 @lshl_add_u64_s2v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_s2v:
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 2, v[{{[0-9:]+}}]
%a = load i64, ptr undef
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, v[2:3]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GI-LABEL: lshl_add_u64_s2v:
; GI: ; %bb.0:
; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, v[2:3]
; GI-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
store i64 %add, ptr undef
ret void
ret i64 %add
}

define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
define i64 @lshl_add_u64_v2s(i64 %a, i64 %v) {
; GCN-LABEL: lshl_add_u64_v2s:
; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 2, s[{{[0-9:]+}}]
%v = load i64, ptr undef
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GI-LABEL: lshl_add_u64_v2s:
; GI: ; %bb.0:
; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GI-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GI-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
store i64 %add, ptr undef
ret void
ret i64 %add
}

define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
define i64 @lshl_add_u64_s2s(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_s2s:
; GCN: s_lshl_b64
; GCN: s_add_u32
; GCN: s_addc_u32
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, v[2:3]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GI-LABEL: lshl_add_u64_s2s:
; GI: ; %bb.0:
; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, v[2:3]
; GI-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
store i64 %add, ptr undef
ret void
ret i64 %add
}

define i64 @add_u64_vv(i64 %v, i64 %a) {
; GCN-LABEL: add_u64_vv:
; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GI-LABEL: add_u64_vv:
; GI: ; %bb.0:
; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GI-NEXT: s_setpc_b64 s[30:31]
%add = add i64 %v, %a
ret i64 %add
}

define amdgpu_kernel void @add_u64_sv(i64 %v) {
define i64 @add_u64_sv(i64 %v, i64 %a) {
; GCN-LABEL: add_u64_sv:
; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
%a = load i64, ptr undef
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GI-LABEL: add_u64_sv:
; GI: ; %bb.0:
; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GI-NEXT: s_setpc_b64 s[30:31]
%add = add i64 %v, %a
store i64 %add, ptr undef
ret void
ret i64 %add
}

define amdgpu_kernel void @add_u64_vs(i64 %a) {
define i64 @add_u64_vs(i64 %a, i64 %v) {
; GCN-LABEL: add_u64_vs:
; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
%v = load i64, ptr undef
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GI-LABEL: add_u64_vs:
; GI: ; %bb.0:
; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GI-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
; GI-NEXT: s_setpc_b64 s[30:31]
%add = add i64 %v, %a
store i64 %add, ptr undef
ret void
ret i64 %add
}

define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
define i64 @add_u64_ss(i64 %v, i64 %a) {
; GCN-LABEL: add_u64_ss:
; GCN: s_add_u32
; GCN: s_addc_u32 s1, s1, s3
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GI-LABEL: add_u64_ss:
; GI: ; %bb.0:
; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
; GI-NEXT: s_setpc_b64 s[30:31]
%add = add i64 %v, %a
store i64 %add, ptr undef
ret void
ret i64 %add
}

define i32 @lshl_add_u64_gep(ptr %p, i64 %a) {
; GCN-LABEL: lshl_add_u64_gep:
; GCN: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GCN-NEXT: flat_load_dword v0, v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GI-LABEL: lshl_add_u64_gep:
; GI: ; %bb.0:
; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GI-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GI-NEXT: flat_load_dword v0, v[0:1]
; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GI-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i32, ptr %p, i64 %a
%v = load i32, ptr %gep
ret i32 %v
}

@arr = global [10 x [10 x i64]] zeroinitializer
define i64 @lshl_add_u64_gep_shift(i64 %row, i64 %col) {
; GCN-LABEL: lshl_add_u64_gep_shift:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_getpc_b64 s[0:1]
; GCN-NEXT: s_add_u32 s0, s0, arr@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s1, s1, arr@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GCN-NEXT: s_movk_i32 s2, 0x50
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GCN-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, v5
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, s2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v5, v0
; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[4:5]
; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GI-LABEL: lshl_add_u64_gep_shift:
; GI: ; %bb.0: ; %entry
; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GI-NEXT: s_getpc_b64 s[0:1]
; GI-NEXT: s_add_u32 s0, s0, arr@gotpcrel32@lo+4
; GI-NEXT: s_addc_u32 s1, s1, arr@gotpcrel32@hi+12
; GI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GI-NEXT: v_mov_b32_e32 v6, 0x50
; GI-NEXT: v_mad_u64_u32 v[4:5], s[2:3], v0, v6, 0
; GI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v1, v6, 0
; GI-NEXT: v_add_u32_e32 v5, v5, v0
; GI-NEXT: s_waitcnt lgkmcnt(0)
; GI-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GI-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
; GI-NEXT: s_nop 1
; GI-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
; GI-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GI-NEXT: s_setpc_b64 s[30:31]
entry:
%base = getelementptr [10 x [10 x i64]], ptr @arr, i64 0, i64 %row, i64 0
%shifted_col = shl i64 %col, 2 ; scale the column index by 4 (shift left by 2)
%ptr = getelementptr i8, ptr %base, i64 %shifted_col
%val = load i64, ptr %ptr
ret i64 %val
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add test cases with SGPR inputs. Also add a few with vectors of pointers (and addrspace(1) pointers); I'm not sure if they will work as-is.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added SGPR input tests, and vector pointer tests.