Skip to content

Commit 9dc4448

Browse files
committed
Enabling fastpath allocation
1 parent 7cc64d5 commit 9dc4448

File tree

2 files changed

+160
-118
lines changed

2 files changed

+160
-118
lines changed

src/llvm-final-gc-lowering.cpp

Lines changed: 21 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ void FinalLowerGC::lowerNewGCFrame(CallInst *target, Function &F)
103103
builder.CreateMemSet(gcframe, Constant::getNullValue(Type::getInt8Ty(F.getContext())), ptrsize * (nRoots + 2), Align(16), tbaa_gcframe);
104104

105105
target->replaceAllUsesWith(gcframe);
106+
target->eraseFromParent();
106107
}
107108

108109
void FinalLowerGC::lowerPushGCFrame(CallInst *target, Function &F)
@@ -130,6 +131,7 @@ void FinalLowerGC::lowerPushGCFrame(CallInst *target, Function &F)
130131
gcframe,
131132
pgcstack,
132133
Align(sizeof(void*)));
134+
target->eraseFromParent();
133135
}
134136

135137
void FinalLowerGC::lowerPopGCFrame(CallInst *target, Function &F)
@@ -148,6 +150,7 @@ void FinalLowerGC::lowerPopGCFrame(CallInst *target, Function &F)
148150
pgcstack,
149151
Align(sizeof(void*)));
150152
inst->setMetadata(LLVMContext::MD_tbaa, tbaa_gcframe);
153+
target->eraseFromParent();
151154
}
152155

153156
void FinalLowerGC::lowerGetGCFrameSlot(CallInst *target, Function &F)
@@ -167,6 +170,7 @@ void FinalLowerGC::lowerGetGCFrameSlot(CallInst *target, Function &F)
167170
auto gep = builder.CreateInBoundsGEP(T_prjlvalue, gcframe, index);
168171
gep->takeName(target);
169172
target->replaceAllUsesWith(gep);
173+
target->eraseFromParent();
170174
}
171175

172176
void FinalLowerGC::lowerQueueGCRoot(CallInst *target, Function &F)
@@ -183,6 +187,7 @@ void FinalLowerGC::lowerSafepoint(CallInst *target, Function &F)
183187
IRBuilder<> builder(target);
184188
Value* signal_page = target->getOperand(0);
185189
builder.CreateLoad(T_size, signal_page, true);
190+
target->eraseFromParent();
186191
}
187192

188193
#ifdef MMTK_GC
@@ -209,7 +214,6 @@ void FinalLowerGC::lowerWriteBarrier2Slow(CallInst *target, Function &F)
209214
assert(target->arg_size() == 2);
210215
target->setCalledFunction(writeBarrier2SlowFunc);
211216
}
212-
213217
#endif
214218

215219
void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F)
@@ -235,112 +239,26 @@ void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F)
235239
derefBytes = sz;
236240
}
237241
else {
238-
#ifndef MMTK_GC
239242
auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset);
240243
auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
241244
newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize, type });
242245
if (sz > 0)
243246
derefBytes = sz;
244-
#else // MMTK_GC
245-
auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
246-
auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);
247-
248-
// Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
249-
// Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
250-
const bool INLINE_FASTPATH_ALLOCATION = true;
251-
252-
if (INLINE_FASTPATH_ALLOCATION) {
253-
// Assuming we use the first immix allocator.
254-
// FIXME: We should get the allocator index and type from MMTk.
255-
auto allocator_offset = offsetof(jl_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
256-
257-
auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
258-
auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));
259-
260-
auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
261-
auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
262-
auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");
263-
264-
// offset = 8
265-
auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
266-
auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
267-
auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
268-
// alignment 16 (15 = 16 - 1)
269-
auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
270-
auto result = builder.CreateNSWAdd(cursor, delta, "result");
271-
272-
auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
273-
274-
auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
275-
auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
276-
auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");
277-
278-
auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
279-
280-
auto current_block = target->getParent();
281-
builder.SetInsertPoint(target->getNextNode());
282-
auto phiNode = builder.CreatePHI(poolAllocFunc->getReturnType(), 2, "phi_fast_slow");
283-
auto top_cont = current_block->splitBasicBlock(target->getNextNode(), "top_cont");
284-
285-
auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
286-
auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction(), top_cont);
287-
288-
auto next_br = current_block->getTerminator();
289-
next_br->eraseFromParent();
290-
builder.SetInsertPoint(current_block);
291-
builder.CreateCondBr(gt_limit, slowpath, fastpath);
292-
293-
// slowpath
294-
builder.SetInsertPoint(slowpath);
295-
auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
296-
auto new_call = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
297-
new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
298-
builder.CreateBr(top_cont);
299-
300-
// // fastpath
301-
builder.SetInsertPoint(fastpath);
302-
builder.CreateStore(new_cursor, cursor_ptr);
303-
304-
// ptls->gc_num.allocd += osize;
305-
auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, gc_num));
306-
auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
307-
auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
308-
auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
309-
auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
310-
builder.CreateStore(pool_allocd_total, pool_alloc_tls);
311-
312-
auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
313-
auto v_as_ptr = builder.CreateIntToPtr(v_raw, poolAllocFunc->getReturnType());
314-
builder.CreateBr(top_cont);
315-
316-
phiNode->addIncoming(new_call, slowpath);
317-
phiNode->addIncoming(v_as_ptr, fastpath);
318-
phiNode->takeName(target);
319-
320-
target->replaceAllUsesWith(phiNode);
321-
return;
322-
} else {
323-
auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
324-
newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
325-
if (sz > 0)
326-
derefBytes = sz;
327-
}
328-
#endif // MMTK_GC
329247
}
330248
} else {
331249
auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size);
332250
// allocTypedFunc does not include the type tag in the allocation size!
333251
newI = builder.CreateCall(allocTypedFunc, { ptls, size, type });
334252
derefBytes = sizeof(void*);
335253
}
336-
337254
newI->setAttributes(newI->getCalledFunction()->getAttributes());
338255
unsigned align = std::max((unsigned)target->getRetAlign().valueOrOne().value(), (unsigned)sizeof(void*));
339256
newI->addRetAttr(Attribute::getWithAlignment(F.getContext(), Align(align)));
340257
if (derefBytes > 0)
341258
newI->addDereferenceableRetAttr(derefBytes);
342259
newI->takeName(target);
343260
target->replaceAllUsesWith(newI);
261+
target->eraseFromParent();
344262
}
345263

346264
bool FinalLowerGC::runOnFunction(Function &F)
@@ -362,63 +280,48 @@ bool FinalLowerGC::runOnFunction(Function &F)
362280
poolAllocFunc = getOrDeclare(jl_well_known::GCPoolAlloc);
363281
bigAllocFunc = getOrDeclare(jl_well_known::GCBigAlloc);
364282
allocTypedFunc = getOrDeclare(jl_well_known::GCAllocTyped);
365-
T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
366-
367283
#ifdef MMTK_GC
368284
writeBarrier1Func = getOrDeclare(jl_well_known::GCWriteBarrier1);
369285
writeBarrier2Func = getOrDeclare(jl_well_known::GCWriteBarrier2);
370286
writeBarrier1SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier1Slow);
371287
writeBarrier2SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier2Slow);
372288
#endif
289+
T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
373290

374291
// Lower all calls to supported intrinsics.
375292
for (auto &BB : F) {
376-
for (auto it = BB.begin(); it != BB.end();) {
377-
auto *CI = dyn_cast<CallInst>(&*it);
378-
if (!CI) {
379-
++it;
293+
for (auto &I : make_early_inc_range(BB)) {
294+
auto *CI = dyn_cast<CallInst>(&I);
295+
if (!CI)
380296
continue;
381-
}
382297

383298
Value *callee = CI->getCalledOperand();
384299
assert(callee);
385300

386301
#define LOWER_INTRINSIC(INTRINSIC, LOWER_INTRINSIC_FUNC) \
387-
auto INTRINSIC = getOrNull(jl_intrinsics::INTRINSIC); \
388-
if (INTRINSIC == callee) { \
389-
LOWER_INTRINSIC_FUNC(CI, F); \
390-
it = CI->eraseFromParent(); \
391-
continue; \
392-
} \
302+
do { \
303+
auto intrinsic = getOrNull(jl_intrinsics::INTRINSIC); \
304+
if (intrinsic == callee) { \
305+
LOWER_INTRINSIC_FUNC(CI, F); \
306+
} \
307+
} while (0)
393308

394309
LOWER_INTRINSIC(newGCFrame, lowerNewGCFrame);
395310
LOWER_INTRINSIC(pushGCFrame, lowerPushGCFrame);
396311
LOWER_INTRINSIC(popGCFrame, lowerPopGCFrame);
397312
LOWER_INTRINSIC(getGCFrameSlot, lowerGetGCFrameSlot);
398313
LOWER_INTRINSIC(GCAllocBytes, lowerGCAllocBytes);
314+
LOWER_INTRINSIC(queueGCRoot, lowerQueueGCRoot);
399315
LOWER_INTRINSIC(safepoint, lowerSafepoint);
400316

401-
// These lowerings preserve the CI and do not erase them from the parent
402-
#define LOWER_WB_INTRINSIC(INTRINSIC, LOWER_INTRINSIC_FUNC) \
403-
auto INTRINSIC = getOrNull(jl_intrinsics::INTRINSIC); \
404-
if (INTRINSIC == callee) { \
405-
LOWER_INTRINSIC_FUNC(CI, F); \
406-
++it; \
407-
continue; \
408-
} \
409-
410-
LOWER_WB_INTRINSIC(queueGCRoot, lowerQueueGCRoot);
411-
412317
#ifdef MMTK_GC
413-
LOWER_WB_INTRINSIC(writeBarrier1, lowerWriteBarrier1);
414-
LOWER_WB_INTRINSIC(writeBarrier2, lowerWriteBarrier2);
415-
LOWER_WB_INTRINSIC(writeBarrier1Slow, lowerWriteBarrier1Slow);
416-
LOWER_WB_INTRINSIC(writeBarrier2Slow, lowerWriteBarrier2Slow);
318+
LOWER_INTRINSIC(writeBarrier1, lowerWriteBarrier1); // each MMTk write-barrier intrinsic is routed to its matching lowering routine
319+
LOWER_INTRINSIC(writeBarrier2, lowerWriteBarrier2);
320+
LOWER_INTRINSIC(writeBarrier1Slow, lowerWriteBarrier1Slow);
321+
LOWER_INTRINSIC(writeBarrier2Slow, lowerWriteBarrier2Slow);
417322
#endif
418-
++it;
419323

420324
#undef LOWER_INTRINSIC
421-
#undef LOWER_WB_INTRINSIC
422325
}
423326
}
424327

0 commit comments

Comments
 (0)