Skip to content

Commit 2dd2a2d

Browse files
committed
Allow sinking of free vector ops
1 parent f802acf commit 2dd2a2d

File tree

6 files changed

+1098
-932
lines changed

6 files changed

+1098
-932
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,6 +1301,87 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
13011301

13021302
if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
13031303
Ops.push_back(&Op);
1304+
1305+
// Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
1306+
// will be optimized away, and sinking them can help SDAG combines.
1307+
const DataLayout &DL = I->getModule()->getDataLayout();
1308+
1309+
uint64_t VecIndex;
1310+
Value *Vec;
1311+
if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
1312+
Instruction *OpInst = cast<Instruction>(Op.get());
1313+
Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
1314+
// If a zero cost extractelement instruction is the only use of the vector,
1315+
// then it may be combined with the def.
1316+
if (VecOpInst && VecOpInst->hasOneUse())
1317+
continue;
1318+
1319+
if (getVectorInstrCost(OpInst->getOpcode(), Vec->getType(),
1320+
TTI::TCK_RecipThroughput, VecIndex,
1321+
OpInst->getOperand(0), OpInst->getOperand(1)) == 0)
1322+
Ops.push_back(&Op);
1323+
1324+
continue;
1325+
}
1326+
1327+
if (match(Op.get(),
1328+
m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
1329+
Instruction *OpInst = cast<Instruction>(Op.get());
1330+
if (getVectorInstrCost(OpInst->getOpcode(), Vec->getType(),
1331+
TTI::TCK_RecipThroughput, VecIndex,
1332+
OpInst->getOperand(0), OpInst->getOperand(1)) == 0)
1333+
Ops.push_back(&Op);
1334+
1335+
continue;
1336+
}
1337+
1338+
if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1339+
if (Shuffle->isIdentity()) {
1340+
Ops.push_back(&Op);
1341+
continue;
1342+
}
1343+
1344+
unsigned EltSize = DL.getTypeSizeInBits(
1345+
cast<VectorType>(cast<VectorType>(Shuffle->getType()))
1346+
->getElementType());
1347+
1348+
// For i32 (or greater) shufflevectors, these will be lowered into a
1349+
// series of insert / extract elements, which will be coalesced away.
1350+
if (EltSize >= 32) {
1351+
Ops.push_back(&Op);
1352+
continue;
1353+
}
1354+
1355+
if (EltSize < 16 || !ST->has16BitInsts())
1356+
continue;
1357+
1358+
int NumSubElts, SubIndex;
1359+
if (Shuffle->changesLength()) {
1360+
if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1361+
Ops.push_back(&Op);
1362+
continue;
1363+
}
1364+
1365+
if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1366+
Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1367+
!(SubIndex % 2)) {
1368+
Ops.push_back(&Op);
1369+
continue;
1370+
}
1371+
}
1372+
1373+
if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1374+
Shuffle->isSingleSource()) {
1375+
Ops.push_back(&Op);
1376+
continue;
1377+
}
1378+
1379+
if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex) &&
1380+
!(SubIndex % 2)) {
1381+
Ops.push_back(&Op);
1382+
continue;
1383+
}
1384+
}
13041385
}
13051386

13061387
return !Ops.empty();

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

Lines changed: 60 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2149,11 +2149,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
21492149
; CI-NEXT: s_cbranch_vccz .LBB11_2
21502150
; CI-NEXT: ; %bb.1: ; %frem.else
21512151
; CI-NEXT: s_and_b32 s6, s2, 0x80000000
2152-
; CI-NEXT: v_mov_b32_e32 v1, s4
2153-
; CI-NEXT: v_mov_b32_e32 v0, s2
2154-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
2155-
; CI-NEXT: v_mov_b32_e32 v1, s6
2156-
; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2152+
; CI-NEXT: v_mov_b32_e32 v0, s4
2153+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v0|
2154+
; CI-NEXT: v_mov_b32_e32 v0, s6
2155+
; CI-NEXT: v_mov_b32_e32 v1, s2
2156+
; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
21572157
; CI-NEXT: s_mov_b32 s6, 0
21582158
; CI-NEXT: .LBB11_2: ; %Flow53
21592159
; CI-NEXT: s_xor_b32 s6, s6, 1
@@ -2224,11 +2224,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
22242224
; CI-NEXT: s_cbranch_vccz .LBB11_10
22252225
; CI-NEXT: ; %bb.9: ; %frem.else16
22262226
; CI-NEXT: s_and_b32 s6, s3, 0x80000000
2227-
; CI-NEXT: v_mov_b32_e32 v2, s5
2228-
; CI-NEXT: v_mov_b32_e32 v1, s3
2229-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
2230-
; CI-NEXT: v_mov_b32_e32 v2, s6
2231-
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2227+
; CI-NEXT: v_mov_b32_e32 v1, s5
2228+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v1|
2229+
; CI-NEXT: v_mov_b32_e32 v1, s6
2230+
; CI-NEXT: v_mov_b32_e32 v2, s3
2231+
; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
22322232
; CI-NEXT: s_mov_b32 s6, 0
22332233
; CI-NEXT: .LBB11_10: ; %Flow49
22342234
; CI-NEXT: s_xor_b32 s6, s6, 1
@@ -2322,11 +2322,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
23222322
; VI-NEXT: s_cbranch_vccz .LBB11_2
23232323
; VI-NEXT: ; %bb.1: ; %frem.else
23242324
; VI-NEXT: s_and_b32 s6, s2, 0x80000000
2325-
; VI-NEXT: v_mov_b32_e32 v1, s4
2326-
; VI-NEXT: v_mov_b32_e32 v0, s2
2327-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
2328-
; VI-NEXT: v_mov_b32_e32 v1, s6
2329-
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2325+
; VI-NEXT: v_mov_b32_e32 v0, s4
2326+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v0|
2327+
; VI-NEXT: v_mov_b32_e32 v0, s6
2328+
; VI-NEXT: v_mov_b32_e32 v1, s2
2329+
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
23302330
; VI-NEXT: s_mov_b32 s6, 0
23312331
; VI-NEXT: .LBB11_2: ; %Flow53
23322332
; VI-NEXT: s_xor_b32 s6, s6, 1
@@ -2397,11 +2397,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
23972397
; VI-NEXT: s_cbranch_vccz .LBB11_10
23982398
; VI-NEXT: ; %bb.9: ; %frem.else16
23992399
; VI-NEXT: s_and_b32 s6, s3, 0x80000000
2400-
; VI-NEXT: v_mov_b32_e32 v2, s5
2401-
; VI-NEXT: v_mov_b32_e32 v1, s3
2402-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
2403-
; VI-NEXT: v_mov_b32_e32 v2, s6
2404-
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2400+
; VI-NEXT: v_mov_b32_e32 v1, s5
2401+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v1|
2402+
; VI-NEXT: v_mov_b32_e32 v1, s6
2403+
; VI-NEXT: v_mov_b32_e32 v2, s3
2404+
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
24052405
; VI-NEXT: s_mov_b32 s6, 0
24062406
; VI-NEXT: .LBB11_10: ; %Flow49
24072407
; VI-NEXT: s_xor_b32 s6, s6, 1
@@ -2503,11 +2503,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
25032503
; CI-NEXT: s_cbranch_vccz .LBB12_2
25042504
; CI-NEXT: ; %bb.1: ; %frem.else
25052505
; CI-NEXT: s_and_b32 s2, s4, 0x80000000
2506-
; CI-NEXT: v_mov_b32_e32 v1, s8
2507-
; CI-NEXT: v_mov_b32_e32 v0, s4
2508-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
2509-
; CI-NEXT: v_mov_b32_e32 v1, s2
2510-
; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2506+
; CI-NEXT: v_mov_b32_e32 v0, s8
2507+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v0|
2508+
; CI-NEXT: v_mov_b32_e32 v0, s2
2509+
; CI-NEXT: v_mov_b32_e32 v1, s4
2510+
; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
25112511
; CI-NEXT: s_mov_b32 s2, 0
25122512
; CI-NEXT: .LBB12_2: ; %Flow127
25132513
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2578,11 +2578,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
25782578
; CI-NEXT: s_cbranch_vccz .LBB12_10
25792579
; CI-NEXT: ; %bb.9: ; %frem.else16
25802580
; CI-NEXT: s_and_b32 s2, s5, 0x80000000
2581-
; CI-NEXT: v_mov_b32_e32 v2, s9
2582-
; CI-NEXT: v_mov_b32_e32 v1, s5
2583-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
2584-
; CI-NEXT: v_mov_b32_e32 v2, s2
2585-
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2581+
; CI-NEXT: v_mov_b32_e32 v1, s9
2582+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v1|
2583+
; CI-NEXT: v_mov_b32_e32 v1, s2
2584+
; CI-NEXT: v_mov_b32_e32 v2, s5
2585+
; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
25862586
; CI-NEXT: s_mov_b32 s2, 0
25872587
; CI-NEXT: .LBB12_10: ; %Flow123
25882588
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2653,11 +2653,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
26532653
; CI-NEXT: s_cbranch_vccz .LBB12_18
26542654
; CI-NEXT: ; %bb.17: ; %frem.else47
26552655
; CI-NEXT: s_and_b32 s2, s6, 0x80000000
2656-
; CI-NEXT: v_mov_b32_e32 v3, s10
2657-
; CI-NEXT: v_mov_b32_e32 v2, s6
2658-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
2659-
; CI-NEXT: v_mov_b32_e32 v3, s2
2660-
; CI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
2656+
; CI-NEXT: v_mov_b32_e32 v2, s10
2657+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v2|
2658+
; CI-NEXT: v_mov_b32_e32 v2, s2
2659+
; CI-NEXT: v_mov_b32_e32 v3, s6
2660+
; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
26612661
; CI-NEXT: s_mov_b32 s2, 0
26622662
; CI-NEXT: .LBB12_18: ; %Flow119
26632663
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2728,11 +2728,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
27282728
; CI-NEXT: s_cbranch_vccz .LBB12_26
27292729
; CI-NEXT: ; %bb.25: ; %frem.else78
27302730
; CI-NEXT: s_and_b32 s2, s7, 0x80000000
2731-
; CI-NEXT: v_mov_b32_e32 v4, s11
2732-
; CI-NEXT: v_mov_b32_e32 v3, s7
2733-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
2734-
; CI-NEXT: v_mov_b32_e32 v4, s2
2735-
; CI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2731+
; CI-NEXT: v_mov_b32_e32 v3, s11
2732+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v3|
2733+
; CI-NEXT: v_mov_b32_e32 v3, s2
2734+
; CI-NEXT: v_mov_b32_e32 v4, s7
2735+
; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
27362736
; CI-NEXT: s_mov_b32 s2, 0
27372737
; CI-NEXT: .LBB12_26: ; %Flow115
27382738
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2834,11 +2834,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
28342834
; VI-NEXT: s_cbranch_vccz .LBB12_2
28352835
; VI-NEXT: ; %bb.1: ; %frem.else
28362836
; VI-NEXT: s_and_b32 s2, s4, 0x80000000
2837-
; VI-NEXT: v_mov_b32_e32 v1, s8
2838-
; VI-NEXT: v_mov_b32_e32 v0, s4
2839-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
2840-
; VI-NEXT: v_mov_b32_e32 v1, s2
2841-
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2837+
; VI-NEXT: v_mov_b32_e32 v0, s8
2838+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v0|
2839+
; VI-NEXT: v_mov_b32_e32 v0, s2
2840+
; VI-NEXT: v_mov_b32_e32 v1, s4
2841+
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
28422842
; VI-NEXT: s_mov_b32 s2, 0
28432843
; VI-NEXT: .LBB12_2: ; %Flow127
28442844
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -2909,11 +2909,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
29092909
; VI-NEXT: s_cbranch_vccz .LBB12_10
29102910
; VI-NEXT: ; %bb.9: ; %frem.else16
29112911
; VI-NEXT: s_and_b32 s2, s5, 0x80000000
2912-
; VI-NEXT: v_mov_b32_e32 v2, s9
2913-
; VI-NEXT: v_mov_b32_e32 v1, s5
2914-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
2915-
; VI-NEXT: v_mov_b32_e32 v2, s2
2916-
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2912+
; VI-NEXT: v_mov_b32_e32 v1, s9
2913+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v1|
2914+
; VI-NEXT: v_mov_b32_e32 v1, s2
2915+
; VI-NEXT: v_mov_b32_e32 v2, s5
2916+
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
29172917
; VI-NEXT: s_mov_b32 s2, 0
29182918
; VI-NEXT: .LBB12_10: ; %Flow123
29192919
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -2984,11 +2984,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
29842984
; VI-NEXT: s_cbranch_vccz .LBB12_18
29852985
; VI-NEXT: ; %bb.17: ; %frem.else47
29862986
; VI-NEXT: s_and_b32 s2, s6, 0x80000000
2987-
; VI-NEXT: v_mov_b32_e32 v3, s10
2988-
; VI-NEXT: v_mov_b32_e32 v2, s6
2989-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
2990-
; VI-NEXT: v_mov_b32_e32 v3, s2
2991-
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
2987+
; VI-NEXT: v_mov_b32_e32 v2, s10
2988+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v2|
2989+
; VI-NEXT: v_mov_b32_e32 v2, s2
2990+
; VI-NEXT: v_mov_b32_e32 v3, s6
2991+
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
29922992
; VI-NEXT: s_mov_b32 s2, 0
29932993
; VI-NEXT: .LBB12_18: ; %Flow119
29942994
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -3059,11 +3059,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
30593059
; VI-NEXT: s_cbranch_vccz .LBB12_26
30603060
; VI-NEXT: ; %bb.25: ; %frem.else78
30613061
; VI-NEXT: s_and_b32 s2, s7, 0x80000000
3062-
; VI-NEXT: v_mov_b32_e32 v4, s11
3063-
; VI-NEXT: v_mov_b32_e32 v3, s7
3064-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
3065-
; VI-NEXT: v_mov_b32_e32 v4, s2
3066-
; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
3062+
; VI-NEXT: v_mov_b32_e32 v3, s11
3063+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v3|
3064+
; VI-NEXT: v_mov_b32_e32 v3, s2
3065+
; VI-NEXT: v_mov_b32_e32 v4, s7
3066+
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
30673067
; VI-NEXT: s_mov_b32 s2, 0
30683068
; VI-NEXT: .LBB12_26: ; %Flow115
30693069
; VI-NEXT: s_xor_b32 s2, s2, 1

0 commit comments

Comments
 (0)