Skip to content

Commit 0cf29f3

Browse files
committed
Allow sinking of free vector ops
1 parent 263377a commit 0cf29f3

File tree

6 files changed

+996
-966
lines changed

6 files changed

+996
-966
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,6 +1301,87 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
13011301

13021302
if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
13031303
Ops.push_back(&Op);
1304+
1305+
// Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
1306+
// will be optimized away, and sinking them can help SDAG combines.
1307+
const DataLayout &DL = I->getModule()->getDataLayout();
1308+
1309+
uint64_t VecIndex;
1310+
Value *Vec;
1311+
if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
1312+
Instruction *OpInst = cast<Instruction>(Op.get());
1313+
Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
1314+
// If a zero cost extractelement instruction is the only use of the vector,
1315+
// then it may be combined with the def.
1316+
if (VecOpInst && VecOpInst->hasOneUse())
1317+
continue;
1318+
1319+
if (getVectorInstrCost(OpInst->getOpcode(), Vec->getType(),
1320+
TTI::TCK_RecipThroughput, VecIndex,
1321+
OpInst->getOperand(0), OpInst->getOperand(1)) == 0)
1322+
Ops.push_back(&Op);
1323+
1324+
continue;
1325+
}
1326+
1327+
if (match(Op.get(),
1328+
m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
1329+
Instruction *OpInst = cast<Instruction>(Op.get());
1330+
if (getVectorInstrCost(OpInst->getOpcode(), Vec->getType(),
1331+
TTI::TCK_RecipThroughput, VecIndex,
1332+
OpInst->getOperand(0), OpInst->getOperand(1)) == 0)
1333+
Ops.push_back(&Op);
1334+
1335+
continue;
1336+
}
1337+
1338+
if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1339+
if (Shuffle->isIdentity()) {
1340+
Ops.push_back(&Op);
1341+
continue;
1342+
}
1343+
1344+
unsigned EltSize = DL.getTypeSizeInBits(
1345+
cast<VectorType>(cast<VectorType>(Shuffle->getType()))
1346+
->getElementType());
1347+
1348+
// For shufflevectors with 32-bit (or wider) elements, these will be lowered into a
1349+
// series of insert / extract elements, which will be coalesced away.
1350+
if (EltSize >= 32) {
1351+
Ops.push_back(&Op);
1352+
continue;
1353+
}
1354+
1355+
if (EltSize < 16 || !ST->has16BitInsts())
1356+
continue;
1357+
1358+
int NumSubElts, SubIndex;
1359+
if (Shuffle->changesLength()) {
1360+
if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1361+
Ops.push_back(&Op);
1362+
continue;
1363+
}
1364+
1365+
if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1366+
Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1367+
!(SubIndex & 0x1)) {
1368+
Ops.push_back(&Op);
1369+
continue;
1370+
}
1371+
}
1372+
1373+
if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1374+
Shuffle->isSingleSource()) {
1375+
Ops.push_back(&Op);
1376+
continue;
1377+
}
1378+
1379+
if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex) &&
1380+
!(SubIndex & 0x1)) {
1381+
Ops.push_back(&Op);
1382+
continue;
1383+
}
1384+
}
13041385
}
13051386

13061387
return !Ops.empty();

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

Lines changed: 60 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2146,11 +2146,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
21462146
; CI-NEXT: s_cbranch_vccz .LBB11_2
21472147
; CI-NEXT: ; %bb.1: ; %frem.else16
21482148
; CI-NEXT: s_and_b32 s6, s2, 0x80000000
2149-
; CI-NEXT: v_mov_b32_e32 v1, s4
2150-
; CI-NEXT: v_mov_b32_e32 v0, s2
2151-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
2152-
; CI-NEXT: v_mov_b32_e32 v1, s6
2153-
; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2149+
; CI-NEXT: v_mov_b32_e32 v0, s4
2150+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v0|
2151+
; CI-NEXT: v_mov_b32_e32 v0, s6
2152+
; CI-NEXT: v_mov_b32_e32 v1, s2
2153+
; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
21542154
; CI-NEXT: s_mov_b32 s6, 0
21552155
; CI-NEXT: .LBB11_2: ; %Flow53
21562156
; CI-NEXT: s_xor_b32 s6, s6, 1
@@ -2221,11 +2221,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
22212221
; CI-NEXT: s_cbranch_vccz .LBB11_10
22222222
; CI-NEXT: ; %bb.9: ; %frem.else
22232223
; CI-NEXT: s_and_b32 s6, s3, 0x80000000
2224-
; CI-NEXT: v_mov_b32_e32 v2, s5
2225-
; CI-NEXT: v_mov_b32_e32 v1, s3
2226-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
2227-
; CI-NEXT: v_mov_b32_e32 v2, s6
2228-
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2224+
; CI-NEXT: v_mov_b32_e32 v1, s5
2225+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v1|
2226+
; CI-NEXT: v_mov_b32_e32 v1, s6
2227+
; CI-NEXT: v_mov_b32_e32 v2, s3
2228+
; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
22292229
; CI-NEXT: s_mov_b32 s6, 0
22302230
; CI-NEXT: .LBB11_10: ; %Flow49
22312231
; CI-NEXT: s_xor_b32 s6, s6, 1
@@ -2319,11 +2319,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
23192319
; VI-NEXT: s_cbranch_vccz .LBB11_2
23202320
; VI-NEXT: ; %bb.1: ; %frem.else16
23212321
; VI-NEXT: s_and_b32 s6, s2, 0x80000000
2322-
; VI-NEXT: v_mov_b32_e32 v1, s4
2323-
; VI-NEXT: v_mov_b32_e32 v0, s2
2324-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
2325-
; VI-NEXT: v_mov_b32_e32 v1, s6
2326-
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2322+
; VI-NEXT: v_mov_b32_e32 v0, s4
2323+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v0|
2324+
; VI-NEXT: v_mov_b32_e32 v0, s6
2325+
; VI-NEXT: v_mov_b32_e32 v1, s2
2326+
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
23272327
; VI-NEXT: s_mov_b32 s6, 0
23282328
; VI-NEXT: .LBB11_2: ; %Flow53
23292329
; VI-NEXT: s_xor_b32 s6, s6, 1
@@ -2394,11 +2394,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
23942394
; VI-NEXT: s_cbranch_vccz .LBB11_10
23952395
; VI-NEXT: ; %bb.9: ; %frem.else
23962396
; VI-NEXT: s_and_b32 s6, s3, 0x80000000
2397-
; VI-NEXT: v_mov_b32_e32 v2, s5
2398-
; VI-NEXT: v_mov_b32_e32 v1, s3
2399-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
2400-
; VI-NEXT: v_mov_b32_e32 v2, s6
2401-
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2397+
; VI-NEXT: v_mov_b32_e32 v1, s5
2398+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v1|
2399+
; VI-NEXT: v_mov_b32_e32 v1, s6
2400+
; VI-NEXT: v_mov_b32_e32 v2, s3
2401+
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
24022402
; VI-NEXT: s_mov_b32 s6, 0
24032403
; VI-NEXT: .LBB11_10: ; %Flow49
24042404
; VI-NEXT: s_xor_b32 s6, s6, 1
@@ -2500,11 +2500,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
25002500
; CI-NEXT: s_cbranch_vccz .LBB12_2
25012501
; CI-NEXT: ; %bb.1: ; %frem.else78
25022502
; CI-NEXT: s_and_b32 s2, s4, 0x80000000
2503-
; CI-NEXT: v_mov_b32_e32 v1, s8
2504-
; CI-NEXT: v_mov_b32_e32 v0, s4
2505-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
2506-
; CI-NEXT: v_mov_b32_e32 v1, s2
2507-
; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2503+
; CI-NEXT: v_mov_b32_e32 v0, s8
2504+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v0|
2505+
; CI-NEXT: v_mov_b32_e32 v0, s2
2506+
; CI-NEXT: v_mov_b32_e32 v1, s4
2507+
; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
25082508
; CI-NEXT: s_mov_b32 s2, 0
25092509
; CI-NEXT: .LBB12_2: ; %Flow127
25102510
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2575,11 +2575,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
25752575
; CI-NEXT: s_cbranch_vccz .LBB12_10
25762576
; CI-NEXT: ; %bb.9: ; %frem.else47
25772577
; CI-NEXT: s_and_b32 s2, s5, 0x80000000
2578-
; CI-NEXT: v_mov_b32_e32 v2, s9
2579-
; CI-NEXT: v_mov_b32_e32 v1, s5
2580-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
2581-
; CI-NEXT: v_mov_b32_e32 v2, s2
2582-
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2578+
; CI-NEXT: v_mov_b32_e32 v1, s9
2579+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v1|
2580+
; CI-NEXT: v_mov_b32_e32 v1, s2
2581+
; CI-NEXT: v_mov_b32_e32 v2, s5
2582+
; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
25832583
; CI-NEXT: s_mov_b32 s2, 0
25842584
; CI-NEXT: .LBB12_10: ; %Flow123
25852585
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2650,11 +2650,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
26502650
; CI-NEXT: s_cbranch_vccz .LBB12_18
26512651
; CI-NEXT: ; %bb.17: ; %frem.else16
26522652
; CI-NEXT: s_and_b32 s2, s6, 0x80000000
2653-
; CI-NEXT: v_mov_b32_e32 v3, s10
2654-
; CI-NEXT: v_mov_b32_e32 v2, s6
2655-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
2656-
; CI-NEXT: v_mov_b32_e32 v3, s2
2657-
; CI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
2653+
; CI-NEXT: v_mov_b32_e32 v2, s10
2654+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v2|
2655+
; CI-NEXT: v_mov_b32_e32 v2, s2
2656+
; CI-NEXT: v_mov_b32_e32 v3, s6
2657+
; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
26582658
; CI-NEXT: s_mov_b32 s2, 0
26592659
; CI-NEXT: .LBB12_18: ; %Flow119
26602660
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2725,11 +2725,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
27252725
; CI-NEXT: s_cbranch_vccz .LBB12_26
27262726
; CI-NEXT: ; %bb.25: ; %frem.else
27272727
; CI-NEXT: s_and_b32 s2, s7, 0x80000000
2728-
; CI-NEXT: v_mov_b32_e32 v4, s11
2729-
; CI-NEXT: v_mov_b32_e32 v3, s7
2730-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
2731-
; CI-NEXT: v_mov_b32_e32 v4, s2
2732-
; CI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2728+
; CI-NEXT: v_mov_b32_e32 v3, s11
2729+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v3|
2730+
; CI-NEXT: v_mov_b32_e32 v3, s2
2731+
; CI-NEXT: v_mov_b32_e32 v4, s7
2732+
; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
27332733
; CI-NEXT: s_mov_b32 s2, 0
27342734
; CI-NEXT: .LBB12_26: ; %Flow115
27352735
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2831,11 +2831,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
28312831
; VI-NEXT: s_cbranch_vccz .LBB12_2
28322832
; VI-NEXT: ; %bb.1: ; %frem.else78
28332833
; VI-NEXT: s_and_b32 s2, s4, 0x80000000
2834-
; VI-NEXT: v_mov_b32_e32 v1, s8
2835-
; VI-NEXT: v_mov_b32_e32 v0, s4
2836-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
2837-
; VI-NEXT: v_mov_b32_e32 v1, s2
2838-
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2834+
; VI-NEXT: v_mov_b32_e32 v0, s8
2835+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v0|
2836+
; VI-NEXT: v_mov_b32_e32 v0, s2
2837+
; VI-NEXT: v_mov_b32_e32 v1, s4
2838+
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
28392839
; VI-NEXT: s_mov_b32 s2, 0
28402840
; VI-NEXT: .LBB12_2: ; %Flow127
28412841
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -2906,11 +2906,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
29062906
; VI-NEXT: s_cbranch_vccz .LBB12_10
29072907
; VI-NEXT: ; %bb.9: ; %frem.else47
29082908
; VI-NEXT: s_and_b32 s2, s5, 0x80000000
2909-
; VI-NEXT: v_mov_b32_e32 v2, s9
2910-
; VI-NEXT: v_mov_b32_e32 v1, s5
2911-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
2912-
; VI-NEXT: v_mov_b32_e32 v2, s2
2913-
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2909+
; VI-NEXT: v_mov_b32_e32 v1, s9
2910+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v1|
2911+
; VI-NEXT: v_mov_b32_e32 v1, s2
2912+
; VI-NEXT: v_mov_b32_e32 v2, s5
2913+
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
29142914
; VI-NEXT: s_mov_b32 s2, 0
29152915
; VI-NEXT: .LBB12_10: ; %Flow123
29162916
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -2981,11 +2981,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
29812981
; VI-NEXT: s_cbranch_vccz .LBB12_18
29822982
; VI-NEXT: ; %bb.17: ; %frem.else16
29832983
; VI-NEXT: s_and_b32 s2, s6, 0x80000000
2984-
; VI-NEXT: v_mov_b32_e32 v3, s10
2985-
; VI-NEXT: v_mov_b32_e32 v2, s6
2986-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
2987-
; VI-NEXT: v_mov_b32_e32 v3, s2
2988-
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
2984+
; VI-NEXT: v_mov_b32_e32 v2, s10
2985+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v2|
2986+
; VI-NEXT: v_mov_b32_e32 v2, s2
2987+
; VI-NEXT: v_mov_b32_e32 v3, s6
2988+
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
29892989
; VI-NEXT: s_mov_b32 s2, 0
29902990
; VI-NEXT: .LBB12_18: ; %Flow119
29912991
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -3056,11 +3056,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
30563056
; VI-NEXT: s_cbranch_vccz .LBB12_26
30573057
; VI-NEXT: ; %bb.25: ; %frem.else
30583058
; VI-NEXT: s_and_b32 s2, s7, 0x80000000
3059-
; VI-NEXT: v_mov_b32_e32 v4, s11
3060-
; VI-NEXT: v_mov_b32_e32 v3, s7
3061-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
3062-
; VI-NEXT: v_mov_b32_e32 v4, s2
3063-
; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
3059+
; VI-NEXT: v_mov_b32_e32 v3, s11
3060+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v3|
3061+
; VI-NEXT: v_mov_b32_e32 v3, s2
3062+
; VI-NEXT: v_mov_b32_e32 v4, s7
3063+
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
30643064
; VI-NEXT: s_mov_b32 s2, 0
30653065
; VI-NEXT: .LBB12_26: ; %Flow115
30663066
; VI-NEXT: s_xor_b32 s2, s2, 1

0 commit comments

Comments
 (0)