Skip to content

Commit b929e11

Browse files
committed
Allow sinking of free vector ops
1 parent b20d35c commit b929e11

File tree

6 files changed

+1026
-996
lines changed

6 files changed

+1026
-996
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1378,6 +1378,87 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
13781378

13791379
if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
13801380
Ops.push_back(&Op);
1381+
1382+
// Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
1383+
// will be optimized away, and sinking them can help SDAG combines.
1384+
const DataLayout &DL = I->getModule()->getDataLayout();
1385+
1386+
uint64_t VecIndex;
1387+
Value *Vec;
1388+
if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
1389+
Instruction *OpInst = cast<Instruction>(Op.get());
1390+
Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
1391+
// If a zero cost extractelement instruction is the only use of the vector,
1392+
// then it may be combined with the def.
1393+
if (VecOpInst && VecOpInst->hasOneUse())
1394+
continue;
1395+
1396+
if (getVectorInstrCost(OpInst->getOpcode(), Vec->getType(),
1397+
TTI::TCK_RecipThroughput, VecIndex,
1398+
OpInst->getOperand(0), OpInst->getOperand(1)) == 0)
1399+
Ops.push_back(&Op);
1400+
1401+
continue;
1402+
}
1403+
1404+
if (match(Op.get(),
1405+
m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
1406+
Instruction *OpInst = cast<Instruction>(Op.get());
1407+
if (getVectorInstrCost(OpInst->getOpcode(), Vec->getType(),
1408+
TTI::TCK_RecipThroughput, VecIndex,
1409+
OpInst->getOperand(0), OpInst->getOperand(1)) == 0)
1410+
Ops.push_back(&Op);
1411+
1412+
continue;
1413+
}
1414+
1415+
if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1416+
if (Shuffle->isIdentity()) {
1417+
Ops.push_back(&Op);
1418+
continue;
1419+
}
1420+
1421+
unsigned EltSize = DL.getTypeSizeInBits(
1422+
cast<VectorType>(cast<VectorType>(Shuffle->getType()))
1423+
->getElementType());
1424+
1425+
// For i32 (or greater) shufflevectors, these will be lowered into a
1426+
// series of insert / extract elements, which will be coalesced away.
1427+
if (EltSize >= 32) {
1428+
Ops.push_back(&Op);
1429+
continue;
1430+
}
1431+
1432+
if (EltSize < 16 || !ST->has16BitInsts())
1433+
continue;
1434+
1435+
int NumSubElts, SubIndex;
1436+
if (Shuffle->changesLength()) {
1437+
if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1438+
Ops.push_back(&Op);
1439+
continue;
1440+
}
1441+
1442+
if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1443+
Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1444+
!(SubIndex & 0x1)) {
1445+
Ops.push_back(&Op);
1446+
continue;
1447+
}
1448+
}
1449+
1450+
if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1451+
Shuffle->isSingleSource()) {
1452+
Ops.push_back(&Op);
1453+
continue;
1454+
}
1455+
1456+
if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex) &&
1457+
!(SubIndex & 0x1)) {
1458+
Ops.push_back(&Op);
1459+
continue;
1460+
}
1461+
}
13811462
}
13821463

13831464
return !Ops.empty();

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

Lines changed: 60 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2146,11 +2146,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
21462146
; CI-NEXT: s_cbranch_vccz .LBB11_2
21472147
; CI-NEXT: ; %bb.1: ; %frem.else16
21482148
; CI-NEXT: s_and_b32 s6, s2, 0x80000000
2149-
; CI-NEXT: v_mov_b32_e32 v1, s4
2150-
; CI-NEXT: v_mov_b32_e32 v0, s2
2151-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
2152-
; CI-NEXT: v_mov_b32_e32 v1, s6
2153-
; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2149+
; CI-NEXT: v_mov_b32_e32 v0, s4
2150+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v0|
2151+
; CI-NEXT: v_mov_b32_e32 v0, s6
2152+
; CI-NEXT: v_mov_b32_e32 v1, s2
2153+
; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
21542154
; CI-NEXT: s_mov_b32 s6, 0
21552155
; CI-NEXT: .LBB11_2: ; %Flow53
21562156
; CI-NEXT: s_xor_b32 s6, s6, 1
@@ -2221,11 +2221,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
22212221
; CI-NEXT: s_cbranch_vccz .LBB11_10
22222222
; CI-NEXT: ; %bb.9: ; %frem.else
22232223
; CI-NEXT: s_and_b32 s6, s3, 0x80000000
2224-
; CI-NEXT: v_mov_b32_e32 v2, s5
2225-
; CI-NEXT: v_mov_b32_e32 v1, s3
2226-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
2227-
; CI-NEXT: v_mov_b32_e32 v2, s6
2228-
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2224+
; CI-NEXT: v_mov_b32_e32 v1, s5
2225+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v1|
2226+
; CI-NEXT: v_mov_b32_e32 v1, s6
2227+
; CI-NEXT: v_mov_b32_e32 v2, s3
2228+
; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
22292229
; CI-NEXT: s_mov_b32 s6, 0
22302230
; CI-NEXT: .LBB11_10: ; %Flow49
22312231
; CI-NEXT: s_xor_b32 s6, s6, 1
@@ -2319,11 +2319,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
23192319
; VI-NEXT: s_cbranch_vccz .LBB11_2
23202320
; VI-NEXT: ; %bb.1: ; %frem.else16
23212321
; VI-NEXT: s_and_b32 s6, s2, 0x80000000
2322-
; VI-NEXT: v_mov_b32_e32 v1, s4
2323-
; VI-NEXT: v_mov_b32_e32 v0, s2
2324-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
2325-
; VI-NEXT: v_mov_b32_e32 v1, s6
2326-
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2322+
; VI-NEXT: v_mov_b32_e32 v0, s4
2323+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v0|
2324+
; VI-NEXT: v_mov_b32_e32 v0, s6
2325+
; VI-NEXT: v_mov_b32_e32 v1, s2
2326+
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
23272327
; VI-NEXT: s_mov_b32 s6, 0
23282328
; VI-NEXT: .LBB11_2: ; %Flow53
23292329
; VI-NEXT: s_xor_b32 s6, s6, 1
@@ -2394,11 +2394,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
23942394
; VI-NEXT: s_cbranch_vccz .LBB11_10
23952395
; VI-NEXT: ; %bb.9: ; %frem.else
23962396
; VI-NEXT: s_and_b32 s6, s3, 0x80000000
2397-
; VI-NEXT: v_mov_b32_e32 v2, s5
2398-
; VI-NEXT: v_mov_b32_e32 v1, s3
2399-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
2400-
; VI-NEXT: v_mov_b32_e32 v2, s6
2401-
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2397+
; VI-NEXT: v_mov_b32_e32 v1, s5
2398+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v1|
2399+
; VI-NEXT: v_mov_b32_e32 v1, s6
2400+
; VI-NEXT: v_mov_b32_e32 v2, s3
2401+
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
24022402
; VI-NEXT: s_mov_b32 s6, 0
24032403
; VI-NEXT: .LBB11_10: ; %Flow49
24042404
; VI-NEXT: s_xor_b32 s6, s6, 1
@@ -2500,11 +2500,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
25002500
; CI-NEXT: s_cbranch_vccz .LBB12_2
25012501
; CI-NEXT: ; %bb.1: ; %frem.else78
25022502
; CI-NEXT: s_and_b32 s2, s4, 0x80000000
2503-
; CI-NEXT: v_mov_b32_e32 v1, s8
2504-
; CI-NEXT: v_mov_b32_e32 v0, s4
2505-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
2506-
; CI-NEXT: v_mov_b32_e32 v1, s2
2507-
; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2503+
; CI-NEXT: v_mov_b32_e32 v0, s8
2504+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v0|
2505+
; CI-NEXT: v_mov_b32_e32 v0, s2
2506+
; CI-NEXT: v_mov_b32_e32 v1, s4
2507+
; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
25082508
; CI-NEXT: s_mov_b32 s2, 0
25092509
; CI-NEXT: .LBB12_2: ; %Flow127
25102510
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2575,11 +2575,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
25752575
; CI-NEXT: s_cbranch_vccz .LBB12_10
25762576
; CI-NEXT: ; %bb.9: ; %frem.else47
25772577
; CI-NEXT: s_and_b32 s2, s5, 0x80000000
2578-
; CI-NEXT: v_mov_b32_e32 v2, s9
2579-
; CI-NEXT: v_mov_b32_e32 v1, s5
2580-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
2581-
; CI-NEXT: v_mov_b32_e32 v2, s2
2582-
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2578+
; CI-NEXT: v_mov_b32_e32 v1, s9
2579+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v1|
2580+
; CI-NEXT: v_mov_b32_e32 v1, s2
2581+
; CI-NEXT: v_mov_b32_e32 v2, s5
2582+
; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
25832583
; CI-NEXT: s_mov_b32 s2, 0
25842584
; CI-NEXT: .LBB12_10: ; %Flow123
25852585
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2650,11 +2650,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
26502650
; CI-NEXT: s_cbranch_vccz .LBB12_18
26512651
; CI-NEXT: ; %bb.17: ; %frem.else16
26522652
; CI-NEXT: s_and_b32 s2, s6, 0x80000000
2653-
; CI-NEXT: v_mov_b32_e32 v3, s10
2654-
; CI-NEXT: v_mov_b32_e32 v2, s6
2655-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
2656-
; CI-NEXT: v_mov_b32_e32 v3, s2
2657-
; CI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
2653+
; CI-NEXT: v_mov_b32_e32 v2, s10
2654+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v2|
2655+
; CI-NEXT: v_mov_b32_e32 v2, s2
2656+
; CI-NEXT: v_mov_b32_e32 v3, s6
2657+
; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
26582658
; CI-NEXT: s_mov_b32 s2, 0
26592659
; CI-NEXT: .LBB12_18: ; %Flow119
26602660
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2725,11 +2725,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
27252725
; CI-NEXT: s_cbranch_vccz .LBB12_26
27262726
; CI-NEXT: ; %bb.25: ; %frem.else
27272727
; CI-NEXT: s_and_b32 s2, s7, 0x80000000
2728-
; CI-NEXT: v_mov_b32_e32 v4, s11
2729-
; CI-NEXT: v_mov_b32_e32 v3, s7
2730-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
2731-
; CI-NEXT: v_mov_b32_e32 v4, s2
2732-
; CI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2728+
; CI-NEXT: v_mov_b32_e32 v3, s11
2729+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v3|
2730+
; CI-NEXT: v_mov_b32_e32 v3, s2
2731+
; CI-NEXT: v_mov_b32_e32 v4, s7
2732+
; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
27332733
; CI-NEXT: s_mov_b32 s2, 0
27342734
; CI-NEXT: .LBB12_26: ; %Flow115
27352735
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2831,11 +2831,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
28312831
; VI-NEXT: s_cbranch_vccz .LBB12_2
28322832
; VI-NEXT: ; %bb.1: ; %frem.else78
28332833
; VI-NEXT: s_and_b32 s2, s4, 0x80000000
2834-
; VI-NEXT: v_mov_b32_e32 v1, s8
2835-
; VI-NEXT: v_mov_b32_e32 v0, s4
2836-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
2837-
; VI-NEXT: v_mov_b32_e32 v1, s2
2838-
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2834+
; VI-NEXT: v_mov_b32_e32 v0, s8
2835+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v0|
2836+
; VI-NEXT: v_mov_b32_e32 v0, s2
2837+
; VI-NEXT: v_mov_b32_e32 v1, s4
2838+
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
28392839
; VI-NEXT: s_mov_b32 s2, 0
28402840
; VI-NEXT: .LBB12_2: ; %Flow127
28412841
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -2906,11 +2906,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
29062906
; VI-NEXT: s_cbranch_vccz .LBB12_10
29072907
; VI-NEXT: ; %bb.9: ; %frem.else47
29082908
; VI-NEXT: s_and_b32 s2, s5, 0x80000000
2909-
; VI-NEXT: v_mov_b32_e32 v2, s9
2910-
; VI-NEXT: v_mov_b32_e32 v1, s5
2911-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
2912-
; VI-NEXT: v_mov_b32_e32 v2, s2
2913-
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2909+
; VI-NEXT: v_mov_b32_e32 v1, s9
2910+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v1|
2911+
; VI-NEXT: v_mov_b32_e32 v1, s2
2912+
; VI-NEXT: v_mov_b32_e32 v2, s5
2913+
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
29142914
; VI-NEXT: s_mov_b32 s2, 0
29152915
; VI-NEXT: .LBB12_10: ; %Flow123
29162916
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -2981,11 +2981,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
29812981
; VI-NEXT: s_cbranch_vccz .LBB12_18
29822982
; VI-NEXT: ; %bb.17: ; %frem.else16
29832983
; VI-NEXT: s_and_b32 s2, s6, 0x80000000
2984-
; VI-NEXT: v_mov_b32_e32 v3, s10
2985-
; VI-NEXT: v_mov_b32_e32 v2, s6
2986-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
2987-
; VI-NEXT: v_mov_b32_e32 v3, s2
2988-
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
2984+
; VI-NEXT: v_mov_b32_e32 v2, s10
2985+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v2|
2986+
; VI-NEXT: v_mov_b32_e32 v2, s2
2987+
; VI-NEXT: v_mov_b32_e32 v3, s6
2988+
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
29892989
; VI-NEXT: s_mov_b32 s2, 0
29902990
; VI-NEXT: .LBB12_18: ; %Flow119
29912991
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -3056,11 +3056,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
30563056
; VI-NEXT: s_cbranch_vccz .LBB12_26
30573057
; VI-NEXT: ; %bb.25: ; %frem.else
30583058
; VI-NEXT: s_and_b32 s2, s7, 0x80000000
3059-
; VI-NEXT: v_mov_b32_e32 v4, s11
3060-
; VI-NEXT: v_mov_b32_e32 v3, s7
3061-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
3062-
; VI-NEXT: v_mov_b32_e32 v4, s2
3063-
; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
3059+
; VI-NEXT: v_mov_b32_e32 v3, s11
3060+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v3|
3061+
; VI-NEXT: v_mov_b32_e32 v3, s2
3062+
; VI-NEXT: v_mov_b32_e32 v4, s7
3063+
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
30643064
; VI-NEXT: s_mov_b32 s2, 0
30653065
; VI-NEXT: .LBB12_26: ; %Flow115
30663066
; VI-NEXT: s_xor_b32 s2, s2, 1

0 commit comments

Comments
 (0)