Skip to content

Commit 9f39431

Browse files
authored
Move bigval_t struct to gc-common.h and loop through GCAllocBytes uses to apply fastpath allocation for MMTk (#75)
* Refactor bigval_t struct and move it to gc-common.h * Restructuring lowerGCAllocBytesLate pass
1 parent b4c8f5d commit 9f39431

9 files changed

+149
-184
lines changed

src/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,de
347347
$(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h
348348
$(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
349349
$(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
350-
$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/gc-mmtk.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-mmtk.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h
350+
$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/gc-mmtk.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h
351351
$(BUILDDIR)/gc-stacks.o $(BUILDDIR)/gc-stacks.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
352352
$(BUILDDIR)/gc-stock.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h
353353
$(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc-heap-snapshot.h

src/gc-common.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,31 @@
1616
extern "C" {
1717
#endif
1818

19+
// =========================================================================== //
20+
// GC Big objects
21+
// =========================================================================== //
22+
23+
// Header record for a "big" GC object. The layout is padded so that the
// struct ends on a 64-byte boundary in both 32-bit and 64-bit builds:
// three pointer-sized members (next, prev, sz) + padding + one header word
// add up to 8 words on _P64 and 16 words otherwise, i.e. 64 bytes either way
// (assuming size_t and uintptr_t are pointer-sized — TODO confirm for all targets).
// Do not reorder or resize members without updating the padding arithmetic.
JL_EXTENSION typedef struct _bigval_t {
24+
// Links to neighbouring bigval_t records (presumably a doubly-linked list of
// big objects — confirm against the code that walks it).
struct _bigval_t *next;
25+
struct _bigval_t *prev;
26+
// Presumably the size of the allocation — TODO confirm units and whether
// this header is included.
size_t sz;
27+
#ifdef _P64 // Add padding so that the value is 64-byte aligned
28+
// (8 pointers of 8 bytes each) - (4 other pointers in struct)
29+
void *_padding[8 - 4];
30+
#else
31+
// (16 pointers of 4 bytes each) - (4 other pointers in struct)
32+
void *_padding[16 - 4];
33+
#endif
34+
// The anonymous union below stands in for an embedded tagged-value header
// (the note that follows suggests it mirrors jl_taggedvalue_t — confirm).
//struct jl_taggedvalue_t <>;
35+
union {
36+
// Full header word; `bits.gc` is a 2-bit field overlaying the same storage.
uintptr_t header;
37+
struct {
38+
uintptr_t gc:2;
39+
} bits;
40+
};
41+
// must be 64-byte aligned here, in 32 & 64 bit modes
42+
} bigval_t;
43+
1944
// =========================================================================== //
2045
// GC Callbacks
2146
// =========================================================================== //

src/gc-mmtk.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#include "gc-common.h"
22
#include "mmtkMutator.h"
3-
#include "gc-mmtk.h"
43
#include "threading.h"
54

65
// File exists in the binding

src/gc-mmtk.h

Lines changed: 0 additions & 30 deletions
This file was deleted.

src/gc-stock.h

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "julia_internal.h"
2020
#include "julia_assert.h"
2121
#include "threading.h"
22+
#include "gc-common.h"
2223

2324
#ifdef __cplusplus
2425
extern "C" {
@@ -84,27 +85,6 @@ typedef struct _jl_gc_chunk_t {
8485

8586
extern uintptr_t gc_bigval_sentinel_tag;
8687

87-
JL_EXTENSION typedef struct _bigval_t {
88-
struct _bigval_t *next;
89-
struct _bigval_t *prev;
90-
size_t sz;
91-
#ifdef _P64 // Add padding so that the value is 64-byte aligned
92-
// (8 pointers of 8 bytes each) - (4 other pointers in struct)
93-
void *_padding[8 - 4];
94-
#else
95-
// (16 pointers of 4 bytes each) - (4 other pointers in struct)
96-
void *_padding[16 - 4];
97-
#endif
98-
//struct jl_taggedvalue_t <>;
99-
union {
100-
uintptr_t header;
101-
struct {
102-
uintptr_t gc:2;
103-
} bits;
104-
};
105-
// must be 64-byte aligned here, in 32 & 64 bit modes
106-
} bigval_t;
107-
10888
// pool page metadata
10989
typedef struct _jl_gc_pagemeta_t {
11090
// next metadata structure in per-thread list

src/llvm-gc-interface-passes.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -368,9 +368,7 @@ struct LateLowerGCFrame: private JuliaPassContext {
368368
void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef<int> CalleeRoots);
369369
Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V);
370370
Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V);
371-
#ifdef MMTK_GC
372371
Value* lowerGCAllocBytesLate(CallInst *target, Function &F);
373-
#endif
374372
};
375373

376374
// The final GC lowering pass. This pass lowers platform-agnostic GC

src/llvm-late-gc-lowering-mmtk.cpp

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
// This file is a part of Julia. License is MIT: https://julialang.org/license
2+
13
#include "llvm-gc-interface-passes.h"
24

35
void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *callee, Type *T_size) {
@@ -45,3 +47,99 @@ void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *calle
4547
builder.CreateCall(getOrDeclare(jl_well_known::GCPreserveEndHook), {});
4648
}
4749
}
50+
51+
// MMTk variant: for a julia.gc_alloc_bytes call whose size operand is a
// compile-time constant that jl_gc_classify_pools maps to a pool
// (offset >= 0), emit an inline bump-pointer allocation sequence:
//   - load the Immix allocator cursor and limit out of ptls,
//   - align the cursor, add the pool object size,
//   - branch to a "slowpath" block (call to smallAllocFunc) if the new
//     cursor exceeds the limit, otherwise to a "fastpath" block that stores
//     the new cursor and bumps an allocation counter,
//   - merge both results in a PHI placed right after `target`.
//
// target: the allocation call; operand 0 is used as ptls, operand 1 as the
//         size, operand 2 as the type (exactly 3 operands are asserted).
// F:      the enclosing function; only its LLVMContext is used here.
//
// Returns the new PHI when the fastpath sequence was emitted, or `target`
// itself when nothing was changed (non-constant size, offset < 0, or
// INLINE_FASTPATH_ALLOCATION disabled).
// NOTE(review): `target` is neither erased nor has its uses replaced in this
// function — presumably the caller does that when the return value differs
// from `target`; confirm at the call site.
Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
52+
{
53+
assert(target->arg_size() == 3);
54+
55+
// All instructions created through `builder` are inserted before `target`
// until the insert point is moved further down.
IRBuilder<> builder(target);
56+
auto ptls = target->getArgOperand(0);
57+
auto type = target->getArgOperand(2);
58+
// Only constant-size requests are considered for inlining.
if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
59+
size_t sz = (size_t)CI->getZExtValue();
60+
// This is strongly architecture and OS dependent
61+
int osize;
62+
int offset = jl_gc_classify_pools(sz, &osize);
63+
if (offset >= 0) {
64+
// In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc
65+
// We do a slowpath/fastpath check and lower it only on the slowpath, returning
66+
// the cursor and updating it in the fastpath.
67+
// osize as i32 is the argument for the slowpath call; osize as i64 is
// used in the cursor and counter arithmetic.
auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
68+
auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);
69+
70+
// Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
71+
// Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
72+
const bool INLINE_FASTPATH_ALLOCATION = true;
73+
74+
if (INLINE_FASTPATH_ALLOCATION) {
75+
// Assuming we use the first immix allocator.
76+
// FIXME: We should get the allocator index and type from MMTk.
77+
// Byte offset from ptls to the Immix allocator embedded in the mutator.
auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
78+
79+
// NOTE(review): every address below is modelled as a 64-bit integer, so
// this sequence assumes a 64-bit target — confirm MMTk builds are 64-bit only.
auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
80+
auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));
81+
82+
// cursor = *(i64 *)((i8 *)ptls + cursor_pos)
auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
83+
auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
84+
auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");
85+
86+
// delta = (-8 - cursor) & 15, so that (cursor + delta + 8) is a multiple
// of 16. The 8 presumably corresponds to sizeof(jl_taggedvalue_t) added
// to `result` further down — TODO confirm.
// NOTE(review): the NSW flags assert no signed overflow on what is
// address arithmetic; confirm this is intended.
// offset = 8
87+
auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
88+
auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
89+
auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
90+
// alignment 16 (15 = 16 - 1)
91+
auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
92+
// result = aligned start of the new cell.
auto result = builder.CreateNSWAdd(cursor, delta, "result");
93+
94+
// new_cursor = end of the new cell.
auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
95+
96+
// limit = *(i64 *)((i8 *)ptls + limit_pos)
auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
97+
auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
98+
auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");
99+
100+
// True when the bumped cursor runs past the limit, i.e. take the slowpath.
// NOTE(review): this is a *signed* comparison of two addresses; an
// unsigned predicate (ICmpUGT) would be the usual choice — confirm that
// addresses with the top bit set cannot occur here.
auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
101+
102+
// Both blocks are created up front (empty, appended to the function) and
// handed to SplitBlockAndInsertIfThenElse below; their terminators are
// added manually further down.
auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
103+
auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());
104+
105+
// The block is split before the instruction following `target`, so
// `target` itself stays in the head block.
auto next_instr = target->getNextNode();
106+
// Branch weights for the conditional branch on gt_limit; with slowpath as
// the "then" block this presumably marks slowpath : fastpath as 1 : 9 —
// confirm against the LLVM SplitBlockAndInsertIfThenElse documentation.
SmallVector<uint32_t, 2> Weights{1, 9};
107+
108+
MDBuilder MDB(F.getContext());
109+
SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights));
110+
111+
// The PHI that merges the two allocation results is inserted immediately
// before next_instr (in the block next_instr belongs to after the split).
builder.SetInsertPoint(next_instr);
112+
auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");
113+
114+
// slowpath
115+
builder.SetInsertPoint(slowpath);
116+
// NOTE(review): the pool offset passed to smallAllocFunc is the constant 1,
// not the `offset` computed by jl_gc_classify_pools above — presumably
// the MMTk slowpath ignores it; confirm.
auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
117+
auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
118+
// Copy the callee's declared attributes onto the call site.
new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
119+
// Jump to the block that holds the PHI.
builder.CreateBr(next_instr->getParent());
120+
121+
// fastpath
122+
builder.SetInsertPoint(fastpath);
// Commit the bump: *cursor_ptr = new_cursor.
123+
builder.CreateStore(new_cursor, cursor_ptr);
124+
125+
// Adds osize to the 64-bit value located at the start of
// ptls->gc_tls_common.gc_num (presumably the `allocd` counter — TODO
// confirm it is the first field of gc_num).
126+
auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num));
127+
auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
128+
auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
129+
auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
130+
auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
131+
builder.CreateStore(pool_allocd_total, pool_alloc_tls);
132+
133+
// The value handed back on the fastpath is the cell start advanced by
// sizeof(jl_taggedvalue_t), converted to smallAllocFunc's return type.
auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
134+
auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType());
135+
builder.CreateBr(next_instr->getParent());
136+
137+
// Wire both results into the PHI and let it take over `target`'s name.
phiNode->addIncoming(new_call, slowpath);
138+
phiNode->addIncoming(v_as_ptr, fastpath);
139+
phiNode->takeName(target);
140+
return phiNode;
141+
}
142+
}
143+
}
144+
// Nothing was lowered: hand the original call back unchanged.
return target;
145+
}

src/llvm-late-gc-lowering-stock.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
1+
// This file is a part of Julia. License is MIT: https://julialang.org/license
2+
13
#include "llvm-gc-interface-passes.h"
24

35
// Stock-GC variant of the GC-preserve cleanup hook: intentionally empty.
// None of the parameters (F, CI, callee, T_size) are used.
void LateLowerGCFrame::CleanupGCPreserve(Function &F, CallInst *CI, Value *callee, Type *T_size) {
46
// Do nothing for the stock GC
57
}
8+
9+
// Stock-GC variant: performs no lowering and returns `target` unchanged.
// `F` is unused.
Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
10+
{
11+
// Do nothing for the stock GC
12+
return target;
13+
}

0 commit comments

Comments
 (0)