Skip to content

Commit 39283bc

Browse files
committed
[SeparateConstOffsetFromGEP] Sink constant offset in GEP chain to tail.
Summary: Sinking constant offsets down a GEP chain to its tail helps reduce register usage. For example:
  %gep0 = getelementptr half, ptr addrspace(3) %ptr, i32 512
  %gep1 = getelementptr half, ptr addrspace(3) %gep0, i32 %ofst0
  %gep2 = getelementptr half, ptr addrspace(3) %gep1, i32 %ofst1
  %data = load half, ptr addrspace(3) %gep2, align 2
==>
  %gep0 = getelementptr half, ptr addrspace(3) %ptr, i32 %ofst0
  %gep1 = getelementptr half, ptr addrspace(3) %gep0, i32 %ofst1
  %gep2 = getelementptr half, ptr addrspace(3) %gep1, i32 512
  %data = load half, ptr addrspace(3) %gep2, align 2
1 parent 310ed2b commit 39283bc

File tree

9 files changed

+762
-605
lines changed

9 files changed

+762
-605
lines changed

llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,22 @@ class SeparateConstOffsetFromGEP {
456456
/// A helper that reunites sexts in an instruction.
457457
bool reuniteExts(Instruction *I);
458458

459+
/// Sink constant offset in a GEP chain to tail. For example,
460+
/// %gep0 = getelementptr half, ptr addrspace(3) %ptr, i32 512
461+
/// %gep1 = getelementptr half, ptr addrspace(3) %gep0, i32 %ofst0
462+
/// %gep2 = getelementptr half, ptr addrspace(3) %gep1, i32 %ofst1
463+
/// %data = load half, ptr addrspace(3) %gep2, align 2
464+
/// ==>
465+
/// %gep0 = getelementptr half, ptr addrspace(3) %ptr, i32 %ofst0
466+
/// %gep1 = getelementptr half, ptr addrspace(3) %gep0, i32 %ofst1
467+
/// %gep2 = getelementptr half, ptr addrspace(3) %gep1, i32 512
468+
/// %data = load half, ptr addrspace(3) %gep2, align 2
469+
bool sinkGEPConstantOffset(Function &F);
470+
471+
/// A helper that does sink action for a root in a gep chain.
472+
/// Return true if Ptr is a candidate for upper GEP in recursive calling.
473+
bool sinkGEPConstantOffset(Value *Ptr, bool &Changed);
474+
459475
/// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
460476
Instruction *findClosestMatchingDominator(
461477
ExprKey Key, Instruction *Dominatee,
@@ -1255,6 +1271,8 @@ bool SeparateConstOffsetFromGEP::run(Function &F) {
12551271

12561272
Changed |= reuniteExts(F);
12571273

1274+
Changed |= sinkGEPConstantOffset(F);
1275+
12581276
if (VerifyNoDeadCode)
12591277
verifyNoDeadCode(F);
12601278

@@ -1344,6 +1362,133 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
13441362
return Changed;
13451363
}
13461364

1365+
/// Sink the constant offset found in the GEP chain that ends at \p Ptr towards
/// the tail of that chain.
///
/// The chain is walked recursively through the pointer operands, so the GEP
/// furthest from \p Ptr is handled first and a constant offset is moved one
/// step closer to the tail at every level on the way back:
///
///   %gep0 = getelementptr half, ptr addrspace(3) %ptr, i32 512
///   %gep1 = getelementptr half, ptr addrspace(3) %gep0, i32 %ofst0
///   %gep2 = getelementptr half, ptr addrspace(3) %gep1, i32 %ofst1
///   %data = load half, ptr addrspace(3) %gep2, align 2
/// ==>
///   %gep0 = getelementptr half, ptr addrspace(3) %ptr, i32 %ofst0
///   %gep1 = getelementptr half, ptr addrspace(3) %gep0, i32 %ofst1
///   %gep2 = getelementptr half, ptr addrspace(3) %gep1, i32 512
///   %data = load half, ptr addrspace(3) %gep2, align 2
///
/// \param Ptr      Address to process; anything that is not a GEP ends the
///                 recursion.
/// \param Changed  Set to true when the IR is modified; never reset to false.
/// \returns true iff, on return, the value that now stands for \p Ptr is a
///          GEP with exactly one index and that index is a ConstantInt, i.e.
///          it can be merged into / swapped with the GEP that uses it.
///
/// Note that the GEP passed in as \p Ptr may be erased and replaced; callers
/// must re-read the pointer from its user instead of reusing \p Ptr.
bool SeparateConstOffsetFromGEP::sinkGEPConstantOffset(Value *Ptr,
                                                       bool &Changed) {
  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
  if (!GEP)
    return false;

  // Handle the part of the chain above this GEP first. The call may replace
  // the pointer operand, so it is re-read from GEP below.
  bool BaseResult = sinkGEPConstantOffset(GEP->getPointerOperand(), Changed);

  if (GEP->getNumIndices() != 1)
    return false;

  Value *Idx = GEP->getOperand(1);
  auto *IdxC = dyn_cast<ConstantInt>(Idx);

  // Nothing to sink from above: just report whether this GEP itself carries a
  // constant offset.
  if (!BaseResult)
    return IdxC != nullptr;

  // BaseResult == true guarantees a single-index GEP with a constant index.
  auto *BaseGEP = cast<GetElementPtrInst>(GEP->getPointerOperand());
  auto *BaseIdxC = cast<ConstantInt>(BaseGEP->getOperand(1));
  Type *ResTy = GEP->getResultElementType();
  Type *BaseResTy = BaseGEP->getResultElementType();

  if (IdxC) {
    // Two constant offsets in a row are folded into one:
    //   %gep0 = getelementptr half, ptr addrspace(3) %ptr, i32 8
    //   %gep1 = getelementptr half, ptr addrspace(3) %gep0, i32 4
    // as:
    //   %gep1 = getelementptr half, ptr addrspace(3) %ptr, i32 12

    // getSExtValue() asserts on values that need more than 64 bits; keep such
    // GEPs untouched. GEP still has a constant index, so it stays a candidate.
    if (IdxC->getValue().getSignificantBits() > 64 ||
        BaseIdxC->getValue().getSignificantBits() > 64)
      return true;

    // The arithmetic is done on uint64_t so that wrap-around is well defined;
    // GEP offsets are computed modulo the index width anyway.
    uint64_t BaseOffset = static_cast<uint64_t>(BaseIdxC->getSExtValue());
    uint64_t Offset = static_cast<uint64_t>(IdxC->getSExtValue());
    Type *NewResTy = ResTy;
    if (ResTy != BaseResTy) {
      // Different element types: fold to a byte offset on i8.
      TypeSize BaseSize = DL->getTypeAllocSize(BaseResTy);
      TypeSize Size = DL->getTypeAllocSize(ResTy);
      // A scalable size is not a compile-time constant number of bytes.
      if (BaseSize.isScalable() || Size.isScalable())
        return true;
      NewResTy = Type::getInt8Ty(GEP->getContext());
      BaseOffset *= BaseSize.getFixedValue();
      Offset *= Size.getFixedValue();
    }
    int64_t NewIdxValue = static_cast<int64_t>(BaseOffset + Offset);

    bool FitsInt32 = NewIdxValue >= std::numeric_limits<int32_t>::min() &&
                     NewIdxValue <= std::numeric_limits<int32_t>::max();
    Type *NewIdxType = FitsInt32 ? Type::getInt32Ty(GEP->getContext())
                                 : Type::getInt64Ty(GEP->getContext());
    // The value is signed; say so explicitly so a negative offset is not
    // treated as an out-of-range unsigned number.
    Constant *NewIdx = ConstantInt::get(
        NewIdxType, static_cast<uint64_t>(NewIdxValue), /*IsSigned=*/true);

    auto *NewGEP = GetElementPtrInst::Create(
        NewResTy, BaseGEP->getPointerOperand(), NewIdx);
    // The folded GEP starts from BaseGEP's base pointer, so it may only be
    // inbounds if *both* original GEPs were.
    NewGEP->setIsInBounds(GEP->isInBounds() && BaseGEP->isInBounds());
    NewGEP->setDebugLoc(GEP->getDebugLoc());
    NewGEP->insertBefore(GEP->getIterator());
    NewGEP->takeName(GEP);

    GEP->replaceAllUsesWith(NewGEP);
    GEP->eraseFromParent();
    // Do not leave the old base behind as dead code.
    if (BaseGEP->use_empty())
      BaseGEP->eraseFromParent();

    Changed = true;
    return true;
  }

  // A variable offset on top of a constant one: swap them so that the
  // constant ends up closer to the tail.
  //   %gep0 = getelementptr half, ptr addrspace(3) %ptr, i32 8
  //   %gep1 = getelementptr half, ptr addrspace(3) %gep0, i32 %idx
  // as:
  //   %gepx0 = getelementptr half, ptr addrspace(3) %ptr, i32 %idx
  //   %gepx1 = getelementptr half, ptr addrspace(3) %gepx0, i32 8
  //
  // Neither new GEP is marked inbounds: the intermediate address %ptr + %idx
  // did not exist in the original program and may be out of bounds (e.g.
  // %idx == -8 above), which would turn a well-defined address into poison.
  auto *GEPX0 =
      GetElementPtrInst::Create(ResTy, BaseGEP->getPointerOperand(), Idx);
  GEPX0->setDebugLoc(GEP->getDebugLoc());
  GEPX0->insertBefore(GEP->getIterator());
  auto *GEPX1 = GetElementPtrInst::Create(BaseResTy, GEPX0, BaseIdxC);
  GEPX1->setDebugLoc(GEP->getDebugLoc());
  GEPX1->insertBefore(GEP->getIterator());
  GEPX1->takeName(GEP);

  GEP->replaceAllUsesWith(GEPX1);
  GEP->eraseFromParent();
  // Do not leave the old base behind as dead code.
  if (BaseGEP->use_empty())
    BaseGEP->eraseFromParent();

  Changed = true;
  return true;
}
1469+
1470+
/// Run the constant-offset sinking on the address of every load and store in
/// \p F.
///
/// \returns true if any instruction in \p F was changed.
bool SeparateConstOffsetFromGEP::sinkGEPConstantOffset(Function &F) {
  bool Changed = false;

  // Remember the memory instructions, not their pointer operands: the helper
  // erases and replaces GEPs, so a cached pointer operand shared by two
  // accesses would dangle after the first one is processed. Loads and stores
  // themselves are never erased here.
  SmallVector<Instruction *, 4> MemInsts;
  for (BasicBlock &B : F)
    for (Instruction &I : B)
      if (isa<LoadInst>(I) || isa<StoreInst>(I))
        MemInsts.push_back(&I);

  // NOTE(review): blocks unreachable from the entry are visited too; a
  // self-referential GEP there would make the helper recurse forever. Confirm
  // whether unreachable blocks should be skipped like elsewhere in the pass.
  for (Instruction *I : MemInsts) {
    // Fetch the address now so that replacements made while handling earlier
    // accesses are taken into account.
    Value *Ptr = nullptr;
    if (auto *LI = dyn_cast<LoadInst>(I))
      Ptr = LI->getPointerOperand();
    else
      Ptr = cast<StoreInst>(I)->getPointerOperand();
    sinkGEPConstantOffset(Ptr, Changed);
  }

  return Changed;
}
1491+
13471492
void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
13481493
for (BasicBlock &B : F) {
13491494
for (Instruction &I : B) {

llvm/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,3 @@ fooo.exit: ; preds = %do.body.i
4545
store i32 %nb.1.i, ptr %k0, align 4
4646
br label %do.body.i.backedge
4747
}
48-

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,15 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in,
2121
; GCN-NEXT: ; iglp_opt mask(0x00000000)
2222
; GCN-NEXT: s_waitcnt lgkmcnt(0)
2323
; GCN-NEXT: v_add_u32_e32 v1, s0, v0
24-
; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1
25-
; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456
26-
; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440
27-
; GCN-NEXT: ds_read_b128 a[20:23], v2 offset:57424
28-
; GCN-NEXT: ds_read_b128 a[16:19], v2 offset:57408
29-
; GCN-NEXT: ds_read_b128 a[0:3], v2 offset:57344
30-
; GCN-NEXT: ds_read_b128 a[4:7], v2 offset:57360
31-
; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:57376
32-
; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:57392
24+
; GCN-NEXT: v_add_u32_e32 v2, 0x14000, v1
25+
; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:112
26+
; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:96
27+
; GCN-NEXT: ds_read_b128 a[20:23], v2 offset:80
28+
; GCN-NEXT: ds_read_b128 a[16:19], v2 offset:64
29+
; GCN-NEXT: ds_read_b128 a[0:3], v2
30+
; GCN-NEXT: ds_read_b128 a[4:7], v2 offset:16
31+
; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:32
32+
; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:48
3333
; GCN-NEXT: v_mov_b32_e32 v2, 1.0
3434
; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264
3535
; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248
@@ -199,17 +199,17 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias
199199
; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:49184
200200
; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:49168
201201
; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:49152
202-
; GCN-NEXT: v_add_u32_e32 v1, 0x6000, v1
202+
; GCN-NEXT: v_add_u32_e32 v1, 0x14000, v1
203203
; GCN-NEXT: s_waitcnt lgkmcnt(0)
204204
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95]
205-
; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:57456
206-
; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:57440
207-
; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:57424
208-
; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:57408
209-
; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:57344
210-
; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:57360
211-
; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:57376
212-
; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:57392
205+
; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:112
206+
; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:96
207+
; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:80
208+
; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:64
209+
; GCN-NEXT: ds_read_b128 a[32:35], v1
210+
; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:16
211+
; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:32
212+
; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:48
213213
; GCN-NEXT: s_waitcnt lgkmcnt(0)
214214
; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63]
215215
; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112

0 commit comments

Comments
 (0)