Skip to content

Commit a147508

Browse files
topolarity and claude committed
gc: pretenure image objects to skip sysimage subgraph during mark phase
Load image objects as permanently marked (GC_OLD_MARKED) so gc_try_setmark_tag returns 0 immediately and the mark phase never enters the image subgraph. This reduces full-sweep mark times by 10-25x for typical workloads with loaded packages. Image objects live in separate mmap'd regions, are never freed, and are rarely mutated, making them ideal candidates for pretenuring.

A persistent `image_remset` (htable) tracks image objects that have been mutated to reference non-image (collectable) objects. These are discovered at image load time by gc_scan_sysimg_remset and added incrementally by the write barrier in jl_gc_queue_root.

After a full sweep (which clears per-thread remsets), gc_queue_image_remset pushes these entries to the mark queue so their children are properly traced. Quick sweeps don't need this because old objects retain their marks.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 43e9d32 commit a147508

File tree

7 files changed

+187
-41
lines changed

7 files changed

+187
-41
lines changed

src/gc-debug.c

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -745,13 +745,6 @@ void gc_time_pool_end(int sweep_full)
745745
sweep_full ? "full" : "quick");
746746
}
747747

748-
void gc_time_sysimg_end(uint64_t t0)
749-
{
750-
double sweep_pool_sec = (jl_hrtime() - t0) / 1e9;
751-
jl_safe_printf("GC sweep sysimg end %.2f ms\n",
752-
sweep_pool_sec * 1000);
753-
}
754-
755748
static int64_t big_total;
756749
static int64_t big_freed;
757750
static int64_t big_reset;

src/gc-stock.c

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1483,13 +1483,6 @@ static void gc_sweep_pool(void) JL_NOTSAFEPOINT
14831483
gc_time_pool_end(current_sweep_full);
14841484
}
14851485

1486-
static void gc_sweep_perm_alloc(void) JL_NOTSAFEPOINT
1487-
{
1488-
uint64_t t0 = jl_hrtime();
1489-
gc_sweep_sysimg();
1490-
gc_time_sysimg_end(t0);
1491-
}
1492-
14931486
// mark phase
14941487

14951488
JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr)
@@ -1504,6 +1497,18 @@ JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr)
15041497
if (header & GC_OLD) { // write barrier has not been triggered in this object yet
15051498
arraylist_push(&ptls->gc_tls.heap.remset, (jl_value_t*)ptr);
15061499
ptls->gc_tls.heap.remset_nptr++; // conservative
1500+
// Permanently-marked image objects that are mutated need to be
1501+
// persistently tracked, since they would otherwise be skipped
1502+
// during the mark phase. The image_remset is append-only, so
1503+
// this object will be re-scanned in every GC cycle that follows a full sweep.
1504+
// Deduplication via image_remset prevents unbounded growth
1505+
// from repeated mutations of the same image object.
1506+
if (__unlikely(o->bits.in_image)) {
1507+
JL_LOCK_NOGC(&image_remset_lock);
1508+
if (ptrhash_get(&image_remset, (void*)ptr) == HT_NOTFOUND)
1509+
ptrhash_put(&image_remset, (void*)ptr, (void*)ptr);
1510+
JL_UNLOCK_NOGC(&image_remset_lock);
1511+
}
15071512
}
15081513
}
15091514

@@ -2827,6 +2832,24 @@ static void gc_queue_remset(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) JL_NOTSAFEPO
28272832
ptls2->gc_tls.heap.remset_nptr = 0;
28282833
}
28292834

2835+
// Queue image objects with cross-heap references for marking.
2836+
// These are persistent (never cleared) so that image objects that reference
2837+
// non-image objects are always re-scanned, even though the image objects
2838+
// themselves are permanently marked and would otherwise be skipped.
2839+
static void gc_queue_image_remset(jl_gc_markqueue_t *mq) JL_NOTSAFEPOINT
2840+
{
2841+
size_t sz = image_remset.size;
2842+
void **table = image_remset.table;
2843+
for (size_t i = 0; i < sz; i += 2) {
2844+
void *_v = table[i];
2845+
if (_v != HT_NOTFOUND && _v != NULL) {
2846+
jl_astaggedvalue(_v)->bits.gc = GC_OLD_MARKED;
2847+
jl_value_t *v = (jl_value_t *)((uintptr_t)_v | GC_REMSET_PTR_TAG);
2848+
gc_ptr_queue_push(mq, v);
2849+
}
2850+
}
2851+
}
2852+
28302853
static void gc_check_all_remsets_are_empty(void) JL_NOTSAFEPOINT
28312854
{
28322855
for (int i = 0; i < gc_n_threads; i++) {
@@ -3086,6 +3109,12 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) JL_NOTS
30863109
}
30873110
}
30883111
gc_check_all_remsets_are_empty();
3112+
// 1.4. queue image objects with cross-heap references.
3113+
// Only needed after a full sweep (which clears non-image objects'
3114+
// mark bits). After quick sweeps, old objects retain their marks,
3115+
// so children of image_remset entries survive without re-tracing.
3116+
if (prev_sweep_full)
3117+
gc_queue_image_remset(mq);
30893118

30903119
// 2. walk roots
30913120
gc_mark_roots(mq);
@@ -3213,8 +3242,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) JL_NOTS
32133242
gc_scrub();
32143243
gc_verify_tags();
32153244
gc_sweep_pool();
3216-
if (sweep_full)
3217-
gc_sweep_perm_alloc();
32183245
}
32193246

32203247
JL_PROBE_GC_SWEEP_END();
@@ -3737,6 +3764,8 @@ void jl_gc_init(void)
37373764
{
37383765
JL_MUTEX_INIT(&heapsnapshot_lock, "heapsnapshot_lock");
37393766
JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock");
3767+
JL_MUTEX_INIT(&image_remset_lock, "image_remset_lock");
3768+
htable_new(&image_remset, 0);
37403769
uv_mutex_init(&page_profile_lock);
37413770
uv_mutex_init(&gc_perm_lock);
37423771
uv_mutex_init(&gc_pages_lock);

src/gc-stock.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -607,8 +607,6 @@ void gc_final_pause_end(int64_t t0, int64_t tend);
607607
void gc_time_pool_start(void) JL_NOTSAFEPOINT;
608608
void gc_time_count_page(int freedall, int pg_skpd) JL_NOTSAFEPOINT;
609609
void gc_time_pool_end(int sweep_full) JL_NOTSAFEPOINT;
610-
void gc_time_sysimg_end(uint64_t t0) JL_NOTSAFEPOINT;
611-
612610
void gc_time_big_start(void) JL_NOTSAFEPOINT;
613611
void gc_time_count_big(int old_bits, int bits) JL_NOTSAFEPOINT;
614612
void gc_time_big_end(void) JL_NOTSAFEPOINT;
@@ -641,7 +639,6 @@ STATIC_INLINE void gc_time_count_page(int freedall, int pg_skpd) JL_NOTSAFEPOINT
641639
(void)pg_skpd;
642640
}
643641
#define gc_time_pool_end(sweep_full) (void)(sweep_full)
644-
#define gc_time_sysimg_end(t0) (void)(t0)
645642
#define gc_time_big_start()
646643
STATIC_INLINE void gc_time_count_big(int old_bits, int bits) JL_NOTSAFEPOINT
647644
{

src/gc-wb-stock.h

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@ extern "C" {
1414
STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT
1515
{
1616
// parent and ptr isa jl_value_t*
17-
if (__unlikely(jl_astaggedvalue(parent)->bits.gc == 3 /* GC_OLD_MARKED */ && // parent is old and not in remset
18-
(jl_astaggedvalue(ptr)->bits.gc & 1 /* GC_MARKED */) == 0)) // ptr is young
17+
if (__unlikely(jl_astaggedvalue(parent)->bits.gc == 3 /* GC_OLD_MARKED */ &&
18+
(jl_astaggedvalue(parent)->bits.in_image || // image parents are never fully traced
19+
(jl_astaggedvalue(ptr)->bits.gc & 1 /* GC_MARKED */) == 0))) // ptr is young
1920
jl_gc_queue_root((jl_value_t*)parent);
2021
}
2122

@@ -33,7 +34,7 @@ STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_
3334
// ptr is an immutable object
3435
if (__likely(jl_astaggedvalue(parent)->bits.gc != 3))
3536
return; // parent is young or in remset
36-
if (__likely(jl_astaggedvalue(ptr)->bits.gc == 3))
37+
if (__likely(jl_astaggedvalue(ptr)->bits.gc == 3 && !jl_astaggedvalue(parent)->bits.in_image))
3738
return; // ptr is old and not in remset (thus it does not point to young)
3839
jl_datatype_t *dt = (jl_datatype_t*)jl_typeof(ptr);
3940
const jl_datatype_layout_t *ly = dt->layout;
@@ -48,13 +49,14 @@ STATIC_INLINE void jl_gc_wb_genericmemory_copy_boxed(const jl_value_t *dest_owne
4849
if (__unlikely(jl_astaggedvalue(dest_owner)->bits.gc == 3 /* GC_OLD_MARKED */ )) {
4950
jl_value_t *src_owner = jl_genericmemory_owner(src);
5051
size_t done = 0;
51-
if (jl_astaggedvalue(src_owner)->bits.gc != 3 /* GC_OLD_MARKED */) {
52+
int in_image = jl_astaggedvalue(dest_owner)->bits.in_image;
53+
if (in_image || jl_astaggedvalue(src_owner)->bits.gc != 3 /* GC_OLD_MARKED */) {
5254
if (dest_p < src_p || dest_p > src_p + (*n)) {
5355
for (; done < (*n); done++) { // copy forwards
5456
void *val = jl_atomic_load_relaxed(src_p + done);
5557
jl_atomic_store_release(dest_p + done, val);
56-
// `val` is young or old-unmarked
57-
if (val && !(jl_astaggedvalue(val)->bits.gc & 1 /* GC_MARKED */)) {
58+
// `val` is young or old-unmarked (or dest is an image object, in which case any non-null val triggers the barrier)
59+
if (val && (in_image || !(jl_astaggedvalue(val)->bits.gc & 1 /* GC_MARKED */))) {
5860
jl_gc_queue_root(dest_owner);
5961
break;
6062
}
@@ -66,8 +68,8 @@ STATIC_INLINE void jl_gc_wb_genericmemory_copy_boxed(const jl_value_t *dest_owne
6668
for (; done < (*n); done++) { // copy backwards
6769
void *val = jl_atomic_load_relaxed(src_p + (*n) - done - 1);
6870
jl_atomic_store_release(dest_p + (*n) - done - 1, val);
69-
// `val` is young or old-unmarked
70-
if (val && !(jl_astaggedvalue(val)->bits.gc & 1 /* GC_MARKED */)) {
71+
// `val` is young or old-unmarked (or dest is an image object, in which case any non-null val triggers the barrier)
72+
if (val && (in_image || !(jl_astaggedvalue(val)->bits.gc & 1 /* GC_MARKED */))) {
7173
jl_gc_queue_root(dest_owner);
7274
break;
7375
}
@@ -84,7 +86,8 @@ STATIC_INLINE void jl_gc_wb_genericmemory_copy_ptr(const jl_value_t *owner, jl_g
8486
if (__unlikely(jl_astaggedvalue(owner)->bits.gc == 3 /* GC_OLD_MARKED */)) {
8587
jl_value_t *src_owner = jl_genericmemory_owner(src);
8688
size_t elsz = dt->layout->size;
87-
if (jl_astaggedvalue(src_owner)->bits.gc != 3 /* GC_OLD_MARKED */) {
89+
if (jl_astaggedvalue(owner)->bits.in_image ||
90+
jl_astaggedvalue(src_owner)->bits.gc != 3 /* GC_OLD_MARKED */) {
8891
dt = (jl_datatype_t*)jl_tparam1(dt);
8992
for (size_t done = 0; done < n; done++) { // copy forwards
9093
char* s = (char*)src_p+done*elsz;

src/julia_internal.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -534,8 +534,9 @@ jl_value_t *jl_gc_small_alloc_noinline(jl_ptls_t ptls, int offset,
534534
int osize);
535535
jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz);
536536
JL_DLLEXPORT int jl_gc_classify_pools(size_t sz, int *osize) JL_NOTSAFEPOINT;
537-
void gc_sweep_sysimg(void) JL_NOTSAFEPOINT;
538-
537+
void gc_scan_sysimg_remset(void) JL_NOTSAFEPOINT;
538+
extern htable_t image_remset;
539+
extern jl_mutex_t image_remset_lock;
539540

540541
// pools are 16376 bytes large (GC_POOL_SZ - GC_PAGE_OFFSET)
541542
static const int jl_gc_sizeclasses[] = {

src/llvm-final-gc-lowering-stock.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,16 @@ void FinalLowerGC::lowerWriteBarrier(CallInst *target, Function &F) {
5252
auto parent = target->getArgOperand(0);
5353
IRBuilder<> builder(target);
5454
builder.SetCurrentDebugLocation(target->getDebugLoc());
55-
auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent, tbaa_tag), GC_OLD_MARKED, "parent_bits");
55+
auto parTag = EmitLoadTag(builder, T_size, parent, tbaa_tag);
56+
auto parBits = builder.CreateAnd(parTag, GC_OLD_MARKED, "parent_bits");
5657
auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED), "parent_old_marked");
5758
auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, target, false);
5859
builder.SetInsertPoint(mayTrigTerm);
5960
mayTrigTerm->getParent()->setName("may_trigger_wb");
61+
// Image parents are never fully traced by the mark phase, so we must
62+
// always trigger the write barrier regardless of the child's mark bits.
63+
auto parInImage = builder.CreateAnd(parTag, ConstantInt::get(T_size, GC_IN_IMAGE), "parent_in_image");
64+
auto parIsImage = builder.CreateICmpNE(parInImage, ConstantInt::get(T_size, 0), "parent_is_image");
6065
Value *anyChldNotMarked = NULL;
6166
for (unsigned i = 1; i < target->arg_size(); i++) {
6267
Value *child = target->getArgOperand(i);
@@ -65,9 +70,10 @@ void FinalLowerGC::lowerWriteBarrier(CallInst *target, Function &F) {
6570
anyChldNotMarked = anyChldNotMarked ? builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked;
6671
}
6772
assert(anyChldNotMarked); // handled by all_of test above
73+
auto shouldTrigger = builder.CreateOr(parIsImage, anyChldNotMarked, "should_trigger_wb");
6874
MDBuilder MDB(parent->getContext());
6975
SmallVector<uint32_t, 2> Weights{1, 9};
70-
auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false,
76+
auto trigTerm = SplitBlockAndInsertIfThen(shouldTrigger, mayTrigTerm, false,
7177
MDB.createBranchWeights(Weights));
7278
trigTerm->getParent()->setName("trigger_wb");
7379
builder.SetInsertPoint(trigTerm);

0 commit comments

Comments
 (0)