From 9849ed9142f1231a1904667beb2226bf8d3d9e84 Mon Sep 17 00:00:00 2001
From: Tres Popp
Date: Tue, 28 Jan 2025 06:13:44 -0800
Subject: [PATCH 1/4] AMDGPU GlobalISel G_ADD and G_PTR_ADD 64 support

This makes 64-bit G_ADD legal when the subtarget supports v_lshl_add_u64
(hasLshlAddB64), and adds GlobalISel selection patterns for ptradd.
---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 18 +++-
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    | 18 ++++
 llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll      | 87 +++++++++++++++----
 3 files changed, 105 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 740e52fb87dc2..dcf7a0777178d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -736,13 +736,29 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
         .widenScalarToNextMultipleOf(0, 32)
         .maxScalar(0, S32);
   } else {
-    getActionDefinitionsBuilder({G_ADD, G_SUB})
+    getActionDefinitionsBuilder(G_SUB)
         .legalFor({S32, S16, V2S16})
         .clampMaxNumElementsStrict(0, S16, 2)
         .scalarize(0)
         .minScalar(0, S16)
         .widenScalarToNextMultipleOf(0, 32)
         .maxScalar(0, S32);
+    if (ST.hasLshlAddB64())
+      getActionDefinitionsBuilder(G_ADD)
+          .legalFor({S64, S32, S16, V2S16})
+          .clampMaxNumElementsStrict(0, S16, 2)
+          .scalarize(0)
+          .minScalar(0, S16)
+          .widenScalarToNextMultipleOf(0, 32)
+          .maxScalar(0, S32);
+    else
+      getActionDefinitionsBuilder(G_ADD)
+          .legalFor({S32, S16, V2S16})
+          .clampMaxNumElementsStrict(0, S16, 2)
+          .scalarize(0)
+          .minScalar(0, S16)
+          .widenScalarToNextMultipleOf(0, 32)
+          .maxScalar(0, S32);
   }
 
   if (ST.hasScalarSMulU64()) {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 1447804871809..9108760a1f6c7 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -762,6 +762,24 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
 def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
 def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
 
+let SubtargetPredicate = isGFX940Plus in {
+// TODO: Canonicalize these in the target-specific CombinerHelper?
+def : GCNPat<
+  (ptradd (shl i64:$src0, i32:$shift), i64:$src1),
+  (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$shift, VSrc_b64:$src1)
+>;
+
+def : GCNPat<
+  (ptradd i64:$src0, (shl i64:$src1, i32:$shift)),
+  (V_LSHL_ADD_U64_e64 VSrc_b64:$src1, VSrc_b32:$shift, VSrc_b64:$src0)
+>;
+
+def : GCNPat<
+  (ptradd i64:$src0, i64:$src1),
+  (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, (i32 0), VSrc_b64:$src1)
+>;
+}
+
 def : GCNPat<
   (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
   (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index 4262cc44a6e74..a7ccaf79ecae6 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -2,7 +2,10 @@
 
 define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
 ; GCN-LABEL: lshl_add_u64_v1v:
-; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
   %shl = shl i64 %v, 1
   %add = add i64 %shl, %a
   ret i64 %add
@@ -10,7 +13,10 @@ define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
 
 define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
 ; GCN-LABEL: lshl_add_u64_v4v:
-; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 4, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
   %shl = shl i64 %v, 4
   %add = add i64 %shl, %a
   ret i64 %add
@@ -18,8 +24,10 @@ define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
 
 define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
 ; GCN-LABEL: lshl_add_u64_v5v:
-; GCN: v_lshlrev_b64
-; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 5, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
   %shl = shl i64 %v, 5
   %add = add i64 %shl, %a
   ret i64 %add
@@ -27,8 +35,10 @@ define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
 
 define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
 ; GCN-LABEL: lshl_add_u64_vvv:
-; GCN: v_lshlrev_b64
-; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], v2, v[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
   %shl = shl i64 %v, %s
   %add = add i64 %shl, %a
   ret i64 %add
@@ -36,7 +46,13 @@ define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
 
 define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
 ; GCN-LABEL: lshl_add_u64_s2v:
-; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 2, v[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
   %a = load i64, ptr undef
   %shl = shl i64 %v, 2
   %add = add i64 %shl, %a
@@ -46,7 +62,13 @@ define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
 
 define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
 ; GCN-LABEL: lshl_add_u64_v2s:
-; GCN: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 2, s[{{[0-9:]+}}]
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
   %v = load i64, ptr undef
   %shl = shl i64 %v, 2
   %add = add i64 %shl, %a
@@ -56,9 +78,14 @@ define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
 
 define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
 ; GCN-LABEL: lshl_add_u64_s2s:
-; GCN: s_lshl_b64
-; GCN: s_add_u32
-; GCN: s_addc_u32
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
   %shl = shl i64 %v, 2
   %add = add i64 %shl, %a
   store i64 %add, ptr undef
@@ -67,14 +94,23 @@ define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
 
 define i64 @add_u64_vv(i64 %v, i64 %a) {
 ; GCN-LABEL: add_u64_vv:
-; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
   %add = add i64 %v, %a
   ret i64 %add
 }
 
 define amdgpu_kernel void @add_u64_sv(i64 %v) {
 ; GCN-LABEL: add_u64_sv:
-; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
   %a = load i64, ptr undef
   %add = add i64 %v, %a
   store i64 %add, ptr undef
@@ -83,7 +119,13 @@ define amdgpu_kernel void @add_u64_sv(i64 %v) {
 
 define amdgpu_kernel void @add_u64_vs(i64 %a) {
 ; GCN-LABEL: add_u64_vs:
-; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
   %v = load i64, ptr undef
   %add = add i64 %v, %a
   store i64 %add, ptr undef
@@ -92,8 +134,14 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) {
 
 define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
 ; GCN-LABEL: add_u64_ss:
-; GCN: s_add_u32
-; GCN: s_addc_u32 s1, s1, s3
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_u32 s0, s0, s2
+; GCN-NEXT: s_addc_u32 s1, s1, s3
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
+; GCN-NEXT: s_endpgm
   %add = add i64 %v, %a
   store i64 %add, ptr undef
   ret void
@@ -101,7 +149,12 @@ define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
 
 define i32 @lshl_add_u64_gep(ptr %p, i64 %a) {
 ; GCN-LABEL: lshl_add_u64_gep:
-; GCN: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GCN-NEXT: flat_load_dword v0, v[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds i32, ptr %p, i64 %a
   %v = load i32, ptr %gep
   ret i32 %v

From eb6f6d857bab0acb43d841d0e6b3259ad7ffd3bc Mon Sep 17 00:00:00 2001
From: Alan Li
Date: Thu, 27 Feb 2025 09:01:00 -0500
Subject: [PATCH 2/4] Update according to comments

Signed-off-by: Alan Li
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |  2 ++
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 27 ++++++++-----------
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    | 13 +++++----
 3 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index effc8d2ed6b49..35b6b830e332c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2520,6 +2520,8 @@ def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">,
 def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
   AssemblerPredicate<(all_of FeatureAshrPkInsts)>;
 
+def HasLShlAddB64 : Predicate<"Subtarget->hasLshlAddB64()">;
+
 // Include AMDGPU TD files
 include "SISchedule.td"
 include "GCNProcessors.td"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index dcf7a0777178d..73afd70d87685 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -26,6 +26,7 @@
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
@@ -743,22 +744,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
         .minScalar(0, S16)
         .widenScalarToNextMultipleOf(0, 32)
         .maxScalar(0, S32);
-    if (ST.hasLshlAddB64())
-      getActionDefinitionsBuilder(G_ADD)
-          .legalFor({S64, S32, S16, V2S16})
-          .clampMaxNumElementsStrict(0, S16, 2)
-          .scalarize(0)
-          .minScalar(0, S16)
-          .widenScalarToNextMultipleOf(0, 32)
-          .maxScalar(0, S32);
-    else
-      getActionDefinitionsBuilder(G_ADD)
-          .legalFor({S32, S16, V2S16})
-          .clampMaxNumElementsStrict(0, S16, 2)
-          .scalarize(0)
-          .minScalar(0, S16)
-          .widenScalarToNextMultipleOf(0, 32)
-          .maxScalar(0, S32);
+
+    getActionDefinitionsBuilder(G_ADD)
+        .legalFor(ST.hasLshlAddB64()
+                      ? std::initializer_list{S32, S16, V2S16, S64}
+                      : std::initializer_list{S32, S16, V2S16})
+        .clampMaxNumElementsStrict(0, S16, 2)
+        .scalarize(0)
+        .minScalar(0, S16)
+        .widenScalarToNextMultipleOf(0, 32)
+        .maxScalar(0, S32);
   }
 
   if (ST.hasScalarSMulU64()) {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 9108760a1f6c7..e9a9ffda831cc 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -762,7 +762,7 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
 def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
 def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
 
-let SubtargetPredicate = isGFX940Plus in {
+let SubtargetPredicate = HasLShlAddB64 in {
 // TODO: Canonicalize these in the target-specific CombinerHelper?
 def : GCNPat<
   (ptradd (shl i64:$src0, i32:$shift), i64:$src1),
   (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$shift, VSrc_b64:$src1)
@@ -778,17 +778,16 @@ def : GCNPat<
   (ptradd i64:$src0, i64:$src1),
   (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, (i32 0), VSrc_b64:$src1)
 >;
-}
-def : GCNPat<
-  (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
-  (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
-
-let SubtargetPredicate = isGFX940Plus in
 def : GCNPat<
   (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
   (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
 >;
+} // End SubtargetPredicate = HasLShlAddB64
+
+def : GCNPat<
+  (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
+  (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
 
 def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64>;
 def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64>;

From ed14825264950e870793702d0c1747b6912ae54e Mon Sep 17 00:00:00 2001
From: Alan Li
Date: Fri, 28 Feb 2025 09:42:52 -0500
Subject: [PATCH 3/4] update test cases

---
 llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll | 168 +++++++++++++++--------
 1 file changed, 108 insertions(+), 60 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index a7ccaf79ecae6..e477c941e7a07 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -1,4 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -global-isel=1 < %s | FileCheck -check-prefix=GI %s
 
 define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
 ; GCN-LABEL: lshl_add_u64_v1v:
@@ -6,6 +8,12 @@ define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_v1v:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
   %shl = shl i64 %v, 1
   %add = add i64 %shl, %a
   ret i64 %add
@@ -17,6 +25,12 @@ define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 4, v[2:3]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_v4v:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 4, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
   %shl = shl i64 %v, 4
   %add = add i64 %shl, %a
   ret i64 %add
@@ -28,6 +42,13 @@ define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 5, v[2:3]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_v5v:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshlrev_b64 v[0:1], 5, v[0:1]
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
   %shl = shl i64 %v, 5
   %add = add i64 %shl, %a
   ret i64 %add
@@ -39,57 +60,67 @@ define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], v2, v[4:5]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_vvv:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
+; GI-NEXT: s_setpc_b64 s[30:31]
   %shl = shl i64 %v, %s
   %add = add i64 %shl, %a
   ret i64 %add
 }
 
-define amdgpu_kernel void @lshl_add_u64_s2v(i64 %v) {
+define i64 @lshl_add_u64_s2v(i64 %v, i64 %a) {
 ; GCN-LABEL: lshl_add_u64_s2v:
 ; GCN: ; %bb.0:
-; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
-  %a = load i64, ptr undef
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_s2v:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
   %shl = shl i64 %v, 2
   %add = add i64 %shl, %a
-  store i64 %add, ptr undef
-  ret void
+  ret i64 %add
 }
 
-define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
+define i64 @lshl_add_u64_v2s(i64 %a, i64 %v) {
 ; GCN-LABEL: lshl_add_u64_v2s:
 ; GCN: ; %bb.0:
-; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[0:1]
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
-  %v = load i64, ptr undef
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_v2s:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GI-NEXT: s_setpc_b64 s[30:31]
   %shl = shl i64 %v, 2
   %add = add i64 %shl, %a
-  store i64 %add, ptr undef
-  ret void
+  ret i64 %add
 }
 
-define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
+define i64 @lshl_add_u64_s2s(i64 %v, i64 %a) {
 ; GCN-LABEL: lshl_add_u64_s2s:
 ; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 2, v[0:1]
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_s2s:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
   %shl = shl i64 %v, 2
   %add = add i64 %shl, %a
-  store i64 %add, ptr undef
-  ret void
+  ret i64 %add
 }
 
@@ -98,53 +129,62 @@ define i64 @add_u64_vv(i64 %v, i64 %a) {
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
 ; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: add_u64_vv:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
   %add = add i64 %v, %a
   ret i64 %add
 }
 
-define amdgpu_kernel void @add_u64_sv(i64 %v) {
+define i64 @add_u64_sv(i64 %v, i64 %a) {
 ; GCN-LABEL: add_u64_sv:
 ; GCN: ; %bb.0:
-; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
-  %a = load i64, ptr undef
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: add_u64_sv:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
   %add = add i64 %v, %a
-  store i64 %add, ptr undef
-  ret void
+  ret i64 %add
 }
 
-define amdgpu_kernel void @add_u64_vs(i64 %a) {
+define i64 @add_u64_vs(i64 %a, i64 %v) {
 ; GCN-LABEL: add_u64_vs:
 ; GCN: ; %bb.0:
-; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
-  %v = load i64, ptr undef
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: add_u64_vs:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GI-NEXT: s_setpc_b64 s[30:31]
   %add = add i64 %v, %a
-  store i64 %add, ptr undef
-  ret void
+  ret i64 %add
 }
 
-define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
+define i64 @add_u64_ss(i64 %v, i64 %a) {
 ; GCN-LABEL: add_u64_ss:
 ; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_add_u32 s0, s0, s2
-; GCN-NEXT: s_addc_u32 s1, s1, s3
-; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: add_u64_ss:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GI-NEXT: s_setpc_b64 s[30:31]
   %add = add i64 %v, %a
-  store i64 %add, ptr undef
-  ret void
+  ret i64 %add
 }
 
@@ -155,6 +195,14 @@ define i32 @lshl_add_u64_gep(ptr %p, i64 %a) {
 ; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
 ; GCN-NEXT: flat_load_dword v0, v[0:1]
 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_gep:
+; GI: ; %bb.0:
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GI-NEXT: flat_load_dword v0, v[0:1]
+; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GI-NEXT: s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds i32, ptr %p, i64 %a
   %v = load i32, ptr %gep
   ret i32 %v

From 5eea98cfe64fbab0753f49c52ca4f3facb47ed96 Mon Sep 17 00:00:00 2001
From: Alan Li
Date: Fri, 28 Feb 2025 14:02:25 -0500
Subject: [PATCH 4/4] Remove the plain ptradd pattern; add a GEP shift test

---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  2 +-
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |  5 --
 llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll      | 49 +++++++++++++++++++
 3 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 73afd70d87685..43b4dafe6a068 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -747,7 +747,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
 
     getActionDefinitionsBuilder(G_ADD)
         .legalFor(ST.hasLshlAddB64()
-                      ? std::initializer_list{S32, S16, V2S16, S64}
+                      ? std::initializer_list{S64, S32, S16, V2S16}
                       : std::initializer_list{S32, S16, V2S16})
        .clampMaxNumElementsStrict(0, S16, 2)
        .scalarize(0)
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index e9a9ffda831cc..13f61a608aa1c 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -774,11 +774,6 @@ def : GCNPat<
   (V_LSHL_ADD_U64_e64 VSrc_b64:$src1, VSrc_b32:$shift, VSrc_b64:$src0)
 >;
 
-def : GCNPat<
-  (ptradd i64:$src0, i64:$src1),
-  (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, (i32 0), VSrc_b64:$src1)
->;
-
 def : GCNPat<
   (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
   (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
 >;
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index e477c941e7a07..f1fa30cbc639d 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -207,3 +207,52 @@ define i32 @lshl_add_u64_gep(ptr %p, i64 %a) {
   %v = load i32, ptr %gep
   ret i32 %v
 }
+
+@arr = global [10 x [10 x i64]] zeroinitializer
+define i64 @lshl_add_u64_gep_shift(i64 %row, i64 %col) {
+; GCN-LABEL: lshl_add_u64_gep_shift:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_getpc_b64 s[0:1]
+; GCN-NEXT: s_add_u32 s0, s0, arr@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s1, s1, arr@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GCN-NEXT: s_movk_i32 s2, 0x50
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GCN-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, v5
+; GCN-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, s2, v[0:1]
+; GCN-NEXT: v_mov_b32_e32 v5, v0
+; GCN-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[4:5]
+; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GI-LABEL: lshl_add_u64_gep_shift:
+; GI: ; %bb.0: ; %entry
+; GI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT: s_getpc_b64 s[0:1]
+; GI-NEXT: s_add_u32 s0, s0, arr@gotpcrel32@lo+4
+; GI-NEXT: s_addc_u32 s1, s1, arr@gotpcrel32@hi+12
+; GI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GI-NEXT: v_mov_b32_e32 v6, 0x50
+; GI-NEXT: v_mad_u64_u32 v[4:5], s[2:3], v0, v6, 0
+; GI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v1, v6, 0
+; GI-NEXT: v_add_u32_e32 v5, v5, v0
+; GI-NEXT: s_waitcnt lgkmcnt(0)
+; GI-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GI-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
+; GI-NEXT: s_nop 1
+; GI-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
+; GI-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GI-NEXT: s_setpc_b64 s[30:31]
+entry:
+  %base = getelementptr [10 x [10 x i64]], ptr @arr, i64 0, i64 %row, i64 0
+  %shifted_col = shl i64 %col, 2 ; scale the byte offset by 4 (shift left by 2)
+  %ptr = getelementptr i8, ptr %base, i64 %shifted_col
+  %val = load i64, ptr %ptr
+  ret i64 %val
+}
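An illustrative postscript, not part of the series: the ptradd patterns that
survive patch 4 cover the common GEP shape, where GlobalISel legalizes the
index scaling into a shl feeding G_PTR_ADD. Below is a minimal IR sketch,
assuming gfx942 with -global-isel=1; @example_gep is a hypothetical function
written for this note, not one of the committed tests:

define i64 @example_gep(ptr %p, i64 %idx) {
  ; %gep becomes G_PTR_ADD(%p, (shl %idx, 3)) since i64 elements are 8 bytes,
  ; so the (ptradd (shl i64:$src0, i32:$shift), i64:$src1) pattern should
  ; select a single v_lshl_add_u64 with a shift of 3, analogous to the
  ; shift-of-2 selection checked in @lshl_add_u64_gep above.
  %gep = getelementptr inbounds i64, ptr %p, i64 %idx
  %val = load i64, ptr %gep
  ret i64 %val
}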