Skip to content

Commit f3e9035

Browse files
committed
[AMDGPU] Split wide integer dpp8 intrinsic calls
The int_amdgcn_mov_dpp8 intrinsic is declared with llvm_anyint_ty, but only i32 can be selected. To allow the corresponding builtin to be overloaded the same way as int_amdgcn_mov_dpp, calls on unsupported wider integers such as i64 must be split into i32 pieces.
1 parent 2dfb1c6 commit f3e9035

File tree

2 files changed

+68
-0
lines changed

2 files changed

+68
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,7 @@ class AMDGPUCodeGenPrepareImpl
317317
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
318318
bool visitMinNum(IntrinsicInst &I);
319319
bool visitSqrt(IntrinsicInst &I);
320+
bool visitMovDppIntrinsic(IntrinsicInst &I);
320321
bool run(Function &F);
321322
};
322323

@@ -2099,6 +2100,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
20992100
return visitMinNum(I);
21002101
case Intrinsic::sqrt:
21012102
return visitSqrt(I);
2103+
case Intrinsic::amdgcn_mov_dpp8:
2104+
return visitMovDppIntrinsic(I);
21022105
default:
21032106
return false;
21042107
}
@@ -2257,6 +2260,38 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
22572260
return true;
22582261
}
22592262

2263+
// Split unsupported wide integer calls.
2264+
bool AMDGPUCodeGenPrepareImpl::visitMovDppIntrinsic(IntrinsicInst &I) {
2265+
Type *SrcTy = I.getType();
2266+
assert(SrcTy->isIntegerTy());
2267+
unsigned Size = SrcTy->getPrimitiveSizeInBits();
2268+
assert(Size % 32 == 0);
2269+
if (Size <= 32)
2270+
return false;
2271+
2272+
IRBuilder<> Builder(&I);
2273+
Builder.SetCurrentDebugLocation(I.getDebugLoc());
2274+
unsigned NumElt = Size / 32;
2275+
IntegerType *EltTy = Builder.getInt32Ty();
2276+
Type *VecTy = VectorType::get(EltTy, NumElt, false);
2277+
Value *Vec = Builder.CreateBitCast(I.getArgOperand(0), VecTy);
2278+
2279+
unsigned IID = I.getIntrinsicID();
2280+
SmallVector<Value *, 6> Args(I.args());
2281+
SmallVector<Value *, 4> Elts;
2282+
for (unsigned N = 0; N != NumElt; ++N) {
2283+
Args[0] = Builder.CreateExtractElement(Vec, N);
2284+
Elts.push_back(Builder.CreateIntrinsic(EltTy, IID, Args));
2285+
}
2286+
2287+
Value *DppVec = insertValues(Builder, VecTy, Elts);
2288+
Value *NewVal = Builder.CreateBitCast(DppVec, SrcTy);
2289+
NewVal->takeName(&I);
2290+
I.replaceAllUsesWith(NewVal);
2291+
I.eraseFromParent();
2292+
return true;
2293+
}
2294+
22602295
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
22612296
Impl.Mod = &M;
22622297
Impl.DL = &Impl.Mod->getDataLayout();

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,39 @@ define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) {
2424
ret void
2525
}
2626

27+
; i64 case: the checks expect one v_mov_b32_dpp per 32-bit half (v0 and v1),
; each carrying the same dpp8 selector, followed by a single 64-bit store of
; v[0:1].
; GFX10PLUS-LABEL: {{^}}dpp8_i64:
28+
; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
29+
; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
30+
; GFX10PLUS: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
31+
define amdgpu_ps void @dpp8_i64(i64 %in, ptr addrspace(1) %out) {
32+
%tmp0 = call i64 @llvm.amdgcn.mov.dpp8.i64(i64 %in, i32 1) #0
33+
store i64 %tmp0, ptr addrspace(1) %out
34+
ret void
35+
}
36+
37+
; i128 case: the checks expect four v_mov_b32_dpp instructions (v0 through v3),
; each carrying the same dpp8 selector, followed by a single 128-bit store of
; v[0:3].
; GFX10PLUS-LABEL: {{^}}dpp8_i128:
38+
; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
39+
; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
40+
; GFX10PLUS: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
41+
; GFX10PLUS: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0]
42+
; GFX10PLUS: global_store_{{dwordx4|b128}} v[4:5], v[0:3], off
43+
define amdgpu_ps void @dpp8_i128(i128 %in, ptr addrspace(1) %out) {
44+
%tmp0 = call i128 @llvm.amdgcn.mov.dpp8.i128(i128 %in, i32 1) #0
45+
store i128 %tmp0, ptr addrspace(1) %out
46+
ret void
47+
}
48+
49+
; i96 case (a width that is a multiple of 32 but not a power of two): the
; checks expect three v_mov_b32_dpp instructions (v0 through v2), each carrying
; the same dpp8 selector, followed by a single 96-bit store of v[0:2].
; GFX10PLUS-LABEL: {{^}}dpp8_i96:
50+
; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
51+
; GFX10PLUS: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
52+
; GFX10PLUS: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
53+
; GFX10PLUS: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
54+
define amdgpu_ps void @dpp8_i96(i96 %in, ptr addrspace(1) %out) {
55+
%tmp0 = call i96 @llvm.amdgcn.mov.dpp8.i96(i96 %in, i32 1) #0
56+
store i96 %tmp0, ptr addrspace(1) %out
57+
ret void
58+
}
59+
2760
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0
2861

2962
attributes #0 = { nounwind readnone convergent }

0 commit comments

Comments
 (0)