From f3e9035a670986b0ab47bc146520a8dc4168b3eb Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Wed, 23 Oct 2024 14:54:52 -0700
Subject: [PATCH 1/2] [AMDGPU] Split wide integer dpp8 intrinsic calls

The int_amdgcn_mov_dpp8 is declared with llvm_anyint_ty, but we can only
select i32. To allow a corresponding builtin to be overloaded the same way
as int_amdgcn_mov_dpp, we need it to be able to split unsupported wide
(i64 and larger) values.
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    | 35 +++++++++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll    | 33 +++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index c49aab823b44a..4e25f8c946491 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -317,6 +317,7 @@ class AMDGPUCodeGenPrepareImpl
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
   bool visitMinNum(IntrinsicInst &I);
   bool visitSqrt(IntrinsicInst &I);
+  bool visitMovDppIntrinsic(IntrinsicInst &I);
   bool run(Function &F);
 };
 
@@ -2099,6 +2100,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
     return visitMinNum(I);
   case Intrinsic::sqrt:
     return visitSqrt(I);
+  case Intrinsic::amdgcn_mov_dpp8:
+    return visitMovDppIntrinsic(I);
   default:
     return false;
   }
@@ -2257,6 +2260,38 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
   return true;
 }
 
+// Split unsupported wide integer calls.
+bool AMDGPUCodeGenPrepareImpl::visitMovDppIntrinsic(IntrinsicInst &I) {
+  Type *SrcTy = I.getType();
+  assert(SrcTy->isIntegerTy());
+  unsigned Size = SrcTy->getPrimitiveSizeInBits();
+  assert(Size % 32 == 0);
+  if (Size <= 32)
+    return false;
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+  unsigned NumElt = Size / 32;
+  IntegerType *EltTy = Builder.getInt32Ty();
+  Type *VecTy = VectorType::get(EltTy, NumElt, false);
+  Value *Vec = Builder.CreateBitCast(I.getArgOperand(0), VecTy);
+
+  Intrinsic::ID IID = I.getIntrinsicID();
+  SmallVector<Value *> Args(I.args());
+  SmallVector<Value *> Elts;
+  for (unsigned N = 0; N != NumElt; ++N) {
+    Args[0] = Builder.CreateExtractElement(Vec, N);
+    Elts.push_back(Builder.CreateIntrinsic(EltTy, IID, Args));
+  }
+
+  Value *DppVec = insertValues(Builder, VecTy, Elts);
+  Value *NewVal = Builder.CreateBitCast(DppVec, SrcTy);
+  NewVal->takeName(&I);
+  I.replaceAllUsesWith(NewVal);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Impl.Mod = &M;
   Impl.DL = &Impl.Mod->getDataLayout();
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
index 8bff17b729927..35aac8533aa15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
@@ -24,6 +24,39 @@ define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) {
   ret void
 }
 
+; GFX10PLUS-LABEL: {{^}}dpp8_i64:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
+define amdgpu_ps void @dpp8_i64(i64 %in, ptr addrspace(1) %out) {
+  %tmp0 = call i64 @llvm.amdgcn.mov.dpp8.i64(i64 %in, i32 1) #0
+  store i64 %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_i128:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: global_store_{{dwordx4|b128}} v[4:5], v[0:3], off
+define amdgpu_ps void @dpp8_i128(i128 %in, ptr addrspace(1) %out) {
+  %tmp0 = call i128 @llvm.amdgcn.mov.dpp8.i128(i128 %in, i32 1) #0
+  store i128 %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_i96:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
+define amdgpu_ps void @dpp8_i96(i96 %in, ptr addrspace(1) %out) {
+  %tmp0 = call i96 @llvm.amdgcn.mov.dpp8.i96(i96 %in, i32 1) #0
+  store i96 %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0
 
 attributes #0 = { nounwind readnone convergent }

From c35107b598f8f869df0447631d5ce8b43f660fc6 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Mon, 28 Oct 2024 12:29:58 -0700
Subject: [PATCH 2/2] Exit on sub-dword size before the assert that it is divisible by 32.

---
 llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 4e25f8c946491..2432cac95b588 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -2265,9 +2265,9 @@ bool AMDGPUCodeGenPrepareImpl::visitMovDppIntrinsic(IntrinsicInst &I) {
   Type *SrcTy = I.getType();
   assert(SrcTy->isIntegerTy());
   unsigned Size = SrcTy->getPrimitiveSizeInBits();
-  assert(Size % 32 == 0);
   if (Size <= 32)
     return false;
+  assert(Size % 32 == 0);
 
   IRBuilder<> Builder(&I);
   Builder.SetCurrentDebugLocation(I.getDebugLoc());
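
Note (not part of the patch itself): the new visitMovDppIntrinsic hook rewrites a wide dpp8 call in IR before selection by bitcasting the value to an <N x i32> vector, calling the selectable i32 intrinsic on each lane, and bitcasting the rebuilt vector back. A rough sketch of the IR it would produce for the i64 test above follows; value names are illustrative, and the exact insertelement chain emitted by the insertValues helper may differ.

declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32)

define amdgpu_ps void @dpp8_i64_sketch(i64 %in, ptr addrspace(1) %out) {
  ; Split the 64-bit value into two 32-bit lanes.
  %vec = bitcast i64 %in to <2 x i32>
  %lo = extractelement <2 x i32> %vec, i64 0
  %hi = extractelement <2 x i32> %vec, i64 1
  ; Apply the i32 intrinsic to each lane with the same dpp8 selector.
  %lo.dpp = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %lo, i32 1)
  %hi.dpp = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %hi, i32 1)
  ; Reassemble the wide result.
  %tmp = insertelement <2 x i32> poison, i32 %lo.dpp, i64 0
  %res.vec = insertelement <2 x i32> %tmp, i32 %hi.dpp, i64 1
  %res = bitcast <2 x i32> %res.vec to i64
  store i64 %res, ptr addrspace(1) %out
  ret void
}

Each scalar call then selects to a single v_mov_b32_dpp, which is what the GFX10PLUS checks above expect.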