Skip to content

Commit b7fd27b

Browse files
committed
Allow sinking of free vector ops
1 parent f802acf commit b7fd27b

File tree

6 files changed

+998
-932
lines changed

6 files changed

+998
-932
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,6 +1301,90 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
13011301

13021302
if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
13031303
Ops.push_back(&Op);
1304+
1305+
// Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
1306+
// will be optimized away, and sinking them can help SDAG combines.
1307+
DataLayout DL = I->getModule()->getDataLayout();
1308+
auto IsFreeExtractInsert = [&DL, this](VectorType *VecType,
1309+
unsigned VecIndex) {
1310+
unsigned EltSize = DL.getTypeSizeInBits(VecType->getElementType());
1311+
return EltSize >= 32 ||
1312+
(EltSize == 16 && VecIndex == 0 && ST->has16BitInsts());
1313+
};
1314+
1315+
uint64_t VecIndex;
1316+
Value *Vec;
1317+
if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
1318+
Instruction *VecOpInst =
1319+
dyn_cast<Instruction>(cast<Instruction>(Op.get())->getOperand(0));
1320+
// If a zero cost extractelement instruction is the only use of the vector,
1321+
// then it may be combined with the def.
1322+
if (VecOpInst && VecOpInst->hasOneUse())
1323+
continue;
1324+
1325+
if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
1326+
Ops.push_back(&Op);
1327+
1328+
continue;
1329+
}
1330+
1331+
if (match(Op.get(),
1332+
m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
1333+
if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
1334+
Ops.push_back(&Op);
1335+
1336+
continue;
1337+
}
1338+
1339+
if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1340+
if (Shuffle->isIdentity()) {
1341+
Ops.push_back(&Op);
1342+
continue;
1343+
}
1344+
1345+
unsigned EltSize = DL.getTypeSizeInBits(
1346+
cast<VectorType>(cast<VectorType>(Shuffle->getType()))
1347+
->getElementType());
1348+
1349+
// For i32 (or greater) shufflevectors, these will be lowered into a
1350+
// series of insert / extract elements, which will be coalesced away.
1351+
if (EltSize >= 32) {
1352+
Ops.push_back(&Op);
1353+
continue;
1354+
}
1355+
1356+
if (EltSize < 16 || !ST->has16BitInsts())
1357+
continue;
1358+
1359+
int NumSubElts, SubIndex;
1360+
if (Shuffle->changesLength()) {
1361+
if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1362+
Ops.push_back(&Op);
1363+
continue;
1364+
}
1365+
1366+
if (Shuffle->isExtractSubvectorMask(SubIndex) ||
1367+
Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
1368+
if (!(SubIndex % 2)) {
1369+
Ops.push_back(&Op);
1370+
continue;
1371+
}
1372+
}
1373+
}
1374+
1375+
if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1376+
Shuffle->isSingleSource()) {
1377+
Ops.push_back(&Op);
1378+
continue;
1379+
}
1380+
1381+
if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
1382+
if (!(SubIndex % 2)) {
1383+
Ops.push_back(&Op);
1384+
continue;
1385+
}
1386+
}
1387+
}
13041388
}
13051389

13061390
return !Ops.empty();

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

Lines changed: 60 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2149,11 +2149,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
21492149
; CI-NEXT: s_cbranch_vccz .LBB11_2
21502150
; CI-NEXT: ; %bb.1: ; %frem.else
21512151
; CI-NEXT: s_and_b32 s6, s2, 0x80000000
2152-
; CI-NEXT: v_mov_b32_e32 v1, s4
2153-
; CI-NEXT: v_mov_b32_e32 v0, s2
2154-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
2155-
; CI-NEXT: v_mov_b32_e32 v1, s6
2156-
; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2152+
; CI-NEXT: v_mov_b32_e32 v0, s4
2153+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v0|
2154+
; CI-NEXT: v_mov_b32_e32 v0, s6
2155+
; CI-NEXT: v_mov_b32_e32 v1, s2
2156+
; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
21572157
; CI-NEXT: s_mov_b32 s6, 0
21582158
; CI-NEXT: .LBB11_2: ; %Flow53
21592159
; CI-NEXT: s_xor_b32 s6, s6, 1
@@ -2224,11 +2224,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
22242224
; CI-NEXT: s_cbranch_vccz .LBB11_10
22252225
; CI-NEXT: ; %bb.9: ; %frem.else16
22262226
; CI-NEXT: s_and_b32 s6, s3, 0x80000000
2227-
; CI-NEXT: v_mov_b32_e32 v2, s5
2228-
; CI-NEXT: v_mov_b32_e32 v1, s3
2229-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
2230-
; CI-NEXT: v_mov_b32_e32 v2, s6
2231-
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2227+
; CI-NEXT: v_mov_b32_e32 v1, s5
2228+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v1|
2229+
; CI-NEXT: v_mov_b32_e32 v1, s6
2230+
; CI-NEXT: v_mov_b32_e32 v2, s3
2231+
; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
22322232
; CI-NEXT: s_mov_b32 s6, 0
22332233
; CI-NEXT: .LBB11_10: ; %Flow49
22342234
; CI-NEXT: s_xor_b32 s6, s6, 1
@@ -2322,11 +2322,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
23222322
; VI-NEXT: s_cbranch_vccz .LBB11_2
23232323
; VI-NEXT: ; %bb.1: ; %frem.else
23242324
; VI-NEXT: s_and_b32 s6, s2, 0x80000000
2325-
; VI-NEXT: v_mov_b32_e32 v1, s4
2326-
; VI-NEXT: v_mov_b32_e32 v0, s2
2327-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
2328-
; VI-NEXT: v_mov_b32_e32 v1, s6
2329-
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2325+
; VI-NEXT: v_mov_b32_e32 v0, s4
2326+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v0|
2327+
; VI-NEXT: v_mov_b32_e32 v0, s6
2328+
; VI-NEXT: v_mov_b32_e32 v1, s2
2329+
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
23302330
; VI-NEXT: s_mov_b32 s6, 0
23312331
; VI-NEXT: .LBB11_2: ; %Flow53
23322332
; VI-NEXT: s_xor_b32 s6, s6, 1
@@ -2397,11 +2397,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
23972397
; VI-NEXT: s_cbranch_vccz .LBB11_10
23982398
; VI-NEXT: ; %bb.9: ; %frem.else16
23992399
; VI-NEXT: s_and_b32 s6, s3, 0x80000000
2400-
; VI-NEXT: v_mov_b32_e32 v2, s5
2401-
; VI-NEXT: v_mov_b32_e32 v1, s3
2402-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
2403-
; VI-NEXT: v_mov_b32_e32 v2, s6
2404-
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2400+
; VI-NEXT: v_mov_b32_e32 v1, s5
2401+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v1|
2402+
; VI-NEXT: v_mov_b32_e32 v1, s6
2403+
; VI-NEXT: v_mov_b32_e32 v2, s3
2404+
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
24052405
; VI-NEXT: s_mov_b32 s6, 0
24062406
; VI-NEXT: .LBB11_10: ; %Flow49
24072407
; VI-NEXT: s_xor_b32 s6, s6, 1
@@ -2503,11 +2503,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
25032503
; CI-NEXT: s_cbranch_vccz .LBB12_2
25042504
; CI-NEXT: ; %bb.1: ; %frem.else
25052505
; CI-NEXT: s_and_b32 s2, s4, 0x80000000
2506-
; CI-NEXT: v_mov_b32_e32 v1, s8
2507-
; CI-NEXT: v_mov_b32_e32 v0, s4
2508-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
2509-
; CI-NEXT: v_mov_b32_e32 v1, s2
2510-
; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2506+
; CI-NEXT: v_mov_b32_e32 v0, s8
2507+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v0|
2508+
; CI-NEXT: v_mov_b32_e32 v0, s2
2509+
; CI-NEXT: v_mov_b32_e32 v1, s4
2510+
; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
25112511
; CI-NEXT: s_mov_b32 s2, 0
25122512
; CI-NEXT: .LBB12_2: ; %Flow127
25132513
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2578,11 +2578,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
25782578
; CI-NEXT: s_cbranch_vccz .LBB12_10
25792579
; CI-NEXT: ; %bb.9: ; %frem.else16
25802580
; CI-NEXT: s_and_b32 s2, s5, 0x80000000
2581-
; CI-NEXT: v_mov_b32_e32 v2, s9
2582-
; CI-NEXT: v_mov_b32_e32 v1, s5
2583-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
2584-
; CI-NEXT: v_mov_b32_e32 v2, s2
2585-
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2581+
; CI-NEXT: v_mov_b32_e32 v1, s9
2582+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v1|
2583+
; CI-NEXT: v_mov_b32_e32 v1, s2
2584+
; CI-NEXT: v_mov_b32_e32 v2, s5
2585+
; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
25862586
; CI-NEXT: s_mov_b32 s2, 0
25872587
; CI-NEXT: .LBB12_10: ; %Flow123
25882588
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2653,11 +2653,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
26532653
; CI-NEXT: s_cbranch_vccz .LBB12_18
26542654
; CI-NEXT: ; %bb.17: ; %frem.else47
26552655
; CI-NEXT: s_and_b32 s2, s6, 0x80000000
2656-
; CI-NEXT: v_mov_b32_e32 v3, s10
2657-
; CI-NEXT: v_mov_b32_e32 v2, s6
2658-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
2659-
; CI-NEXT: v_mov_b32_e32 v3, s2
2660-
; CI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
2656+
; CI-NEXT: v_mov_b32_e32 v2, s10
2657+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v2|
2658+
; CI-NEXT: v_mov_b32_e32 v2, s2
2659+
; CI-NEXT: v_mov_b32_e32 v3, s6
2660+
; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
26612661
; CI-NEXT: s_mov_b32 s2, 0
26622662
; CI-NEXT: .LBB12_18: ; %Flow119
26632663
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2728,11 +2728,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
27282728
; CI-NEXT: s_cbranch_vccz .LBB12_26
27292729
; CI-NEXT: ; %bb.25: ; %frem.else78
27302730
; CI-NEXT: s_and_b32 s2, s7, 0x80000000
2731-
; CI-NEXT: v_mov_b32_e32 v4, s11
2732-
; CI-NEXT: v_mov_b32_e32 v3, s7
2733-
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
2734-
; CI-NEXT: v_mov_b32_e32 v4, s2
2735-
; CI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2731+
; CI-NEXT: v_mov_b32_e32 v3, s11
2732+
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v3|
2733+
; CI-NEXT: v_mov_b32_e32 v3, s2
2734+
; CI-NEXT: v_mov_b32_e32 v4, s7
2735+
; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
27362736
; CI-NEXT: s_mov_b32 s2, 0
27372737
; CI-NEXT: .LBB12_26: ; %Flow115
27382738
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2834,11 +2834,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
28342834
; VI-NEXT: s_cbranch_vccz .LBB12_2
28352835
; VI-NEXT: ; %bb.1: ; %frem.else
28362836
; VI-NEXT: s_and_b32 s2, s4, 0x80000000
2837-
; VI-NEXT: v_mov_b32_e32 v1, s8
2838-
; VI-NEXT: v_mov_b32_e32 v0, s4
2839-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
2840-
; VI-NEXT: v_mov_b32_e32 v1, s2
2841-
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
2837+
; VI-NEXT: v_mov_b32_e32 v0, s8
2838+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v0|
2839+
; VI-NEXT: v_mov_b32_e32 v0, s2
2840+
; VI-NEXT: v_mov_b32_e32 v1, s4
2841+
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
28422842
; VI-NEXT: s_mov_b32 s2, 0
28432843
; VI-NEXT: .LBB12_2: ; %Flow127
28442844
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -2909,11 +2909,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
29092909
; VI-NEXT: s_cbranch_vccz .LBB12_10
29102910
; VI-NEXT: ; %bb.9: ; %frem.else16
29112911
; VI-NEXT: s_and_b32 s2, s5, 0x80000000
2912-
; VI-NEXT: v_mov_b32_e32 v2, s9
2913-
; VI-NEXT: v_mov_b32_e32 v1, s5
2914-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
2915-
; VI-NEXT: v_mov_b32_e32 v2, s2
2916-
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2912+
; VI-NEXT: v_mov_b32_e32 v1, s9
2913+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v1|
2914+
; VI-NEXT: v_mov_b32_e32 v1, s2
2915+
; VI-NEXT: v_mov_b32_e32 v2, s5
2916+
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
29172917
; VI-NEXT: s_mov_b32 s2, 0
29182918
; VI-NEXT: .LBB12_10: ; %Flow123
29192919
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -2984,11 +2984,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
29842984
; VI-NEXT: s_cbranch_vccz .LBB12_18
29852985
; VI-NEXT: ; %bb.17: ; %frem.else47
29862986
; VI-NEXT: s_and_b32 s2, s6, 0x80000000
2987-
; VI-NEXT: v_mov_b32_e32 v3, s10
2988-
; VI-NEXT: v_mov_b32_e32 v2, s6
2989-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
2990-
; VI-NEXT: v_mov_b32_e32 v3, s2
2991-
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
2987+
; VI-NEXT: v_mov_b32_e32 v2, s10
2988+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v2|
2989+
; VI-NEXT: v_mov_b32_e32 v2, s2
2990+
; VI-NEXT: v_mov_b32_e32 v3, s6
2991+
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
29922992
; VI-NEXT: s_mov_b32 s2, 0
29932993
; VI-NEXT: .LBB12_18: ; %Flow119
29942994
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -3059,11 +3059,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
30593059
; VI-NEXT: s_cbranch_vccz .LBB12_26
30603060
; VI-NEXT: ; %bb.25: ; %frem.else78
30613061
; VI-NEXT: s_and_b32 s2, s7, 0x80000000
3062-
; VI-NEXT: v_mov_b32_e32 v4, s11
3063-
; VI-NEXT: v_mov_b32_e32 v3, s7
3064-
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
3065-
; VI-NEXT: v_mov_b32_e32 v4, s2
3066-
; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
3062+
; VI-NEXT: v_mov_b32_e32 v3, s11
3063+
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v3|
3064+
; VI-NEXT: v_mov_b32_e32 v3, s2
3065+
; VI-NEXT: v_mov_b32_e32 v4, s7
3066+
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
30673067
; VI-NEXT: s_mov_b32 s2, 0
30683068
; VI-NEXT: .LBB12_26: ; %Flow115
30693069
; VI-NEXT: s_xor_b32 s2, s2, 1

0 commit comments

Comments
 (0)