 // enable the "mark alive" pass of GC
 #define GC_ENABLE_MARK_ALIVE 1

+// if true, enable the use of "prefetch" CPU instructions
+#define GC_ENABLE_PREFETCH_INSTRUCTIONS 1
+
 // include additional roots in "mark alive" pass
 #define GC_MARK_ALIVE_EXTRA_ROOTS 1

@@ -464,29 +467,75 @@ gc_maybe_untrack(PyObject *op)
 }

 #ifdef GC_ENABLE_MARK_ALIVE
+
+// prefetch buffer and stack //////////////////////////////////
+
+// The buffer is a circular FIFO queue of PyObject pointers. We take
+// care not to dereference these pointers until they are taken out of
+// the buffer. A prefetch CPU instruction is issued when a pointer is
+// put into the buffer. If all is working as expected, there will be
+// enough time between the enqueue and dequeue so that the needed memory
+// for the object, most importantly the ob_gc_bits and ob_type words,
+// will already be in the CPU cache.
+#define BUFFER_SIZE 256
+#define BUFFER_HI 16
+#define BUFFER_LO 8
+
+#if !(defined(__GNUC__) || defined(__clang__))
+#undef GC_ENABLE_PREFETCH_INSTRUCTIONS
+#endif
+
+#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS
+#define prefetch(ptr) __builtin_prefetch(ptr, 1, 3)
+#else
+#define prefetch(ptr)
+#endif
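
For reference: GCC and Clang's __builtin_prefetch(ptr, rw, locality) takes a
read/write hint (0 = read, 1 = write) and a temporal-locality hint from 0 to
3, where 3 asks for the line to be kept in all cache levels. The expansion
above therefore prefetches with write intent and maximum locality, which fits
a mark phase that will update ob_gc_bits. A standalone illustration of the
same hints (helper name hypothetical, not part of this patch):

    #include <stddef.h>

    // Prefetch every pointer in an array with the hints used by the
    // prefetch() macro above: rw=1 (write), locality=3 (all levels).
    static void
    prefetch_all(void *ptrs[], size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            __builtin_prefetch(ptrs[i], 1, 3);
        }
    }
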
+
+struct gc_mark_args {
+    Py_ssize_t enqueued;
+    Py_ssize_t dequeued;
+    _PyObjectStack stack;
+    PyObject *buffer[BUFFER_SIZE];
+};
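
Note that enqueued and dequeued are monotonically increasing counters, not
wrapped indexes: because BUFFER_SIZE is a power of two, counter % BUFFER_SIZE
selects the slot, and enqueued - dequeued is always the current fill level. A
minimal standalone sketch of this idiom (names hypothetical, not from the
patch):

    #include <assert.h>
    #include <stddef.h>

    #define QUEUE_SIZE 256  // a power of two, like BUFFER_SIZE above

    struct fifo {
        size_t enqueued;   // total pushes ever done
        size_t dequeued;   // total pops ever done
        void *slots[QUEUE_SIZE];
    };

    // Push one item; the caller must ensure the queue is not full.
    static void
    fifo_push(struct fifo *q, void *item)
    {
        assert(q->enqueued - q->dequeued < QUEUE_SIZE);
        q->slots[q->enqueued % QUEUE_SIZE] = item;
        q->enqueued++;
    }

    // Pop the oldest item; the caller must ensure the queue is not empty.
    static void *
    fifo_pop(struct fifo *q)
    {
        assert(q->enqueued != q->dequeued);
        void *item = q->slots[q->dequeued % QUEUE_SIZE];
        q->dequeued++;
        return item;
    }
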
+
+// Called when we run out of space in the buffer. The object will be added
+// to gc_mark_args.stack instead.
 static int
-mark_alive_stack_push(PyObject *op, _PyObjectStack *stack)
+gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
 {
-    if (op == NULL) {
-        return 0;
+    if (_PyObjectStack_Push(ms, op) < 0) {
+        return -1;
     }
-    if (!_PyObject_GC_IS_TRACKED(op)) {
+    return 0;
+}
+
+// Called when there is space in the buffer for the object. Add it to the end
+// of the buffer and issue the prefetch instruction.
+static inline void
+gc_mark_buffer_push(PyObject *op, struct gc_mark_args *args)
+{
+#ifdef Py_DEBUG
+    Py_ssize_t buf_used = args->enqueued - args->dequeued;
+    assert(buf_used < BUFFER_SIZE);
+#endif
+    args->buffer[args->enqueued % BUFFER_SIZE] = op;
+    args->enqueued++;
+    prefetch(op);
+}
+
+// Called when we find an object that needs to be marked alive (either from a
+// root or from calling tp_traverse).
+static int
+gc_mark_enqueue(PyObject *op, struct gc_mark_args *args)
+{
+    assert(op != NULL);
+    if (args->enqueued - args->dequeued < BUFFER_SIZE) {
+        gc_mark_buffer_push(op, args);
         return 0;
     }
-    if (gc_is_alive(op)) {
-        return 0;  // already visited this object
-    }
-    if (gc_maybe_untrack(op)) {
-        return 0;  // was untracked, don't visit it
-    }
-
-    // Need to call tp_traverse on this object. Add to stack and mark it
-    // alive so we don't traverse it a second time.
-    gc_set_alive(op);
-    if (_PyObjectStack_Push(stack, op) < 0) {
-        return -1;
+    else {
+        return gc_mark_stack_push(&args->stack, op);
     }
-    return 0;
 }
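
Note that gc_mark_enqueue() deliberately has the visitproc shape from
CPython's object.h:

    typedef int (*visitproc)(PyObject *, void *);

so gc_propagate_alive() below can hand it straight to tp_traverse with only a
cast.
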

 static bool
@@ -503,36 +552,68 @@ gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
     return true;
 }

+static int
+gc_mark_traverse_list(PyObject *self, void *args)
+{
+    PyListObject *list = (PyListObject *)self;
+    if (list->ob_item == NULL) {
+        return 0;
+    }
+    for (Py_ssize_t i = 0; i < Py_SIZE(list); i++) {
+        if (gc_mark_enqueue(list->ob_item[i], args) < 0) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
+static int
+gc_mark_traverse_tuple(PyObject *self, void *args)
+{
+    _PyTuple_MaybeUntrack(self);
+    if (!gc_has_bit(self, _PyGC_BITS_TRACKED)) {
+        return 0;
+    }
+    PyTupleObject *tuple = _PyTuple_CAST(self);
+    for (Py_ssize_t i = Py_SIZE(tuple); --i >= 0; ) {
+        PyObject *item = tuple->ob_item[i];
+        if (item == NULL) {
+            continue;
+        }
+        if (gc_mark_enqueue(item, args) < 0) {
+            return -1;
+        }
+    }
+    return 0;
+}
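
For comparison, the generic path that these two fast paths shortcut goes
through tp_traverse and makes an indirect visitproc call per element. Roughly
the shape of list_traverse() in Objects/listobject.c (simplified sketch, shown
only to illustrate the per-item indirection being avoided):

    static int
    list_traverse_generic(PyObject *op, visitproc visit, void *arg)
    {
        PyListObject *o = (PyListObject *)op;
        for (Py_ssize_t i = Py_SIZE(o); --i >= 0; ) {
            Py_VISIT(o->ob_item[i]);  // indirect call through "visit"
        }
        return 0;
    }
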
+
 static void
 gc_abort_mark_alive(PyInterpreterState *interp,
                     struct collection_state *state,
-                    _PyObjectStack *stack)
+                    struct gc_mark_args *args)
 {
     // We failed to allocate memory for "stack" while doing the "mark
     // alive" phase. In that case, free the object stack and make sure
     // that no objects have the alive bit set.
-    _PyObjectStack_Clear(stack);
+    _PyObjectStack_Clear(&args->stack);
     gc_visit_heaps(interp, &gc_clear_alive_bits, &state->base);
 }

 #ifdef GC_MARK_ALIVE_STACKS
 static int
-gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref)
+gc_visit_stackref_mark_alive(struct gc_mark_args *args, _PyStackRef stackref)
 {
-    // Note: we MUST check that it is deferred before checking the rest.
-    // Otherwise we might read into invalid memory due to non-deferred references
-    // being dead already.
-    if (PyStackRef_IsDeferred(stackref) && !PyStackRef_IsNull(stackref)) {
+    if (!PyStackRef_IsNull(stackref)) {
         PyObject *op = PyStackRef_AsPyObjectBorrow(stackref);
-        if (mark_alive_stack_push(op, stack) < 0) {
+        if (gc_mark_enqueue(op, args) < 0) {
             return -1;
         }
     }
     return 0;
 }

 static int
-gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *stack)
+gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, struct gc_mark_args *args)
 {
     _Py_FOR_EACH_TSTATE_BEGIN(interp, p) {
         for (_PyInterpreterFrame *f = p->current_frame; f != NULL; f = f->previous) {
@@ -542,12 +623,12 @@ gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *st
             }

             PyCodeObject *co = (PyCodeObject *)executable;
-            int max_stack = co->co_nlocalsplus + co->co_stacksize;
-            if (gc_visit_stackref_mark_alive(stack, f->f_executable) < 0) {
+            int max_stack = co->co_nlocals;
+            if (gc_visit_stackref_mark_alive(args, f->f_executable) < 0) {
                 return -1;
             }
             for (int i = 0; i < max_stack; i++) {
-                if (gc_visit_stackref_mark_alive(stack, f->localsplus[i]) < 0) {
+                if (gc_visit_stackref_mark_alive(args, f->localsplus[i]) < 0) {
                     return -1;
                 }
             }
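
The two changes in this hunk appear to work together: gc_visit_stackref_mark_alive()
above no longer requires references to be deferred, and in exchange the scan
here is narrowed from the full localsplus-plus-evaluation-stack range down to
co_nlocals. Local variable slots presumably remain valid for the lifetime of
the frame, while evaluation-stack slots can hold dead non-deferred references,
which is exactly what the removed "MUST check deferred" comment warned about.
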
@@ -880,22 +961,73 @@ static int
 move_legacy_finalizer_reachable(struct collection_state *state);

 #ifdef GC_ENABLE_MARK_ALIVE
-static int
-propagate_alive_bits(_PyObjectStack *stack)
+
+static void
+gc_mark_buffer_prime(struct gc_mark_args *args)
 {
     for (;;) {
-        PyObject *op = _PyObjectStack_Pop(stack);
+        Py_ssize_t buf_used = args->enqueued - args->dequeued;
+        if (buf_used >= BUFFER_HI) {
+            // When priming, don't fill the buffer since that would
+            // likely cause the stack to be used shortly after when it
+            // fills. We want to use the buffer as much as possible and
+            // so we only fill to BUFFER_HI, not BUFFER_SIZE.
+            return;
+        }
+        PyObject *op = _PyObjectStack_Pop(&args->stack);
         if (op == NULL) {
             break;
         }
-        assert(_PyObject_GC_IS_TRACKED(op));
-        assert(gc_is_alive(op));
+        gc_mark_buffer_push(op, args);
+    }
+}
+
+static int
+gc_propagate_alive(struct gc_mark_args *args)
+{
+    for (;;) {
+        Py_ssize_t buf_used = args->enqueued - args->dequeued;
+        if (buf_used <= BUFFER_LO) {
+            // The mark buffer is getting empty. If it's too empty
+            // then there will not be enough delay between issuing
+            // the prefetch and the object actually being accessed.
+            // Prime the buffer with object pointers from the stack,
+            // if there are any available.
+            gc_mark_buffer_prime(args);
+            if (args->enqueued == args->dequeued) {
+                return 0;  // stack and buffer are both empty
+            }
+        }
+        PyObject *op = args->buffer[args->dequeued % BUFFER_SIZE];
+        args->dequeued++;
+
+        if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
+            continue;
+        }
+
+        if (gc_is_alive(op)) {
+            continue;  // already visited this object
+        }
+
+        // Need to call tp_traverse on this object. Mark it alive so we
+        // don't traverse it a second time.
+        gc_set_alive(op);
+
         traverseproc traverse = Py_TYPE(op)->tp_traverse;
-        if (traverse(op, (visitproc)&mark_alive_stack_push, stack) < 0) {
+        if (traverse == PyList_Type.tp_traverse) {
+            if (gc_mark_traverse_list(op, args) < 0) {
+                return -1;
+            }
+        }
+        else if (traverse == PyTuple_Type.tp_traverse) {
+            if (gc_mark_traverse_tuple(op, args) < 0) {
+                return -1;
+            }
+        }
+        else if (traverse(op, (visitproc)&gc_mark_enqueue, args) < 0) {
             return -1;
         }
     }
-    return 0;
 }
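
Taken together, BUFFER_LO and BUFFER_HI form a low/high watermark scheme:
gc_propagate_alive() drains the buffer until at most BUFFER_LO (8) prefetched
pointers remain in flight, then gc_mark_buffer_prime() refills it from the
overflow stack up to BUFFER_HI (16). Refilling only to BUFFER_HI rather than
BUFFER_SIZE leaves headroom for the pointers that tp_traverse enqueues while
the current object is being scanned, so newly discovered objects usually land
in the prefetched buffer instead of spilling to the stack.
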

 // Using tp_traverse, mark everything reachable from known root objects
@@ -915,48 +1047,52 @@ propagate_alive_bits(_PyObjectStack *stack)
 //
 // Returns -1 on failure (out of memory).
 static int
-mark_alive_from_roots(PyInterpreterState *interp,
-                      struct collection_state *state)
+gc_mark_alive_from_roots(PyInterpreterState *interp,
+                         struct collection_state *state)
 {
 #ifdef GC_DEBUG
     // Check that all objects don't have alive bit set
     gc_visit_heaps(interp, &validate_alive_bits, &state->base);
 #endif
-    _PyObjectStack stack = { NULL };
-
-#define STACK_PUSH(op) \
-    if (mark_alive_stack_push(op, &stack) < 0) { \
-        gc_abort_mark_alive(interp, state, &stack); \
-        return -1; \
+    struct gc_mark_args mark_args = { 0 };
+
+#define MARK_ENQUEUE(op) \
+    if (op != NULL) { \
+        if (gc_mark_enqueue(op, &mark_args) < 0) { \
+            gc_abort_mark_alive(interp, state, &mark_args); \
+            return -1; \
+        } \
     }
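
Note how the NULL guard migrated from the helper into this macro: the old
mark_alive_stack_push() accepted NULL and returned early, whereas
gc_mark_enqueue() now asserts op != NULL. The roots pushed below can
legitimately be NULL (e.g. tp_dict and tp_subclasses of static types that
were never initialized), so the macro filters them before enqueueing.
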
-    STACK_PUSH(interp->sysdict);
+    MARK_ENQUEUE(interp->sysdict);
 #ifdef GC_MARK_ALIVE_EXTRA_ROOTS
-    STACK_PUSH(interp->builtins);
-    STACK_PUSH(interp->dict);
+    MARK_ENQUEUE(interp->builtins);
+    MARK_ENQUEUE(interp->dict);
     struct types_state *types = &interp->types;
     for (int i = 0; i < _Py_MAX_MANAGED_STATIC_BUILTIN_TYPES; i++) {
-        STACK_PUSH(types->builtins.initialized[i].tp_dict);
-        STACK_PUSH(types->builtins.initialized[i].tp_subclasses);
+        MARK_ENQUEUE(types->builtins.initialized[i].tp_dict);
+        MARK_ENQUEUE(types->builtins.initialized[i].tp_subclasses);
     }
     for (int i = 0; i < _Py_MAX_MANAGED_STATIC_EXT_TYPES; i++) {
-        STACK_PUSH(types->for_extensions.initialized[i].tp_dict);
-        STACK_PUSH(types->for_extensions.initialized[i].tp_subclasses);
+        MARK_ENQUEUE(types->for_extensions.initialized[i].tp_dict);
+        MARK_ENQUEUE(types->for_extensions.initialized[i].tp_subclasses);
     }
 #endif
 #ifdef GC_MARK_ALIVE_STACKS
-    if (gc_visit_thread_stacks_mark_alive(interp, &stack) < 0) {
-        gc_abort_mark_alive(interp, state, &stack);
+    if (gc_visit_thread_stacks_mark_alive(interp, &mark_args) < 0) {
+        gc_abort_mark_alive(interp, state, &mark_args);
         return -1;
     }
 #endif
-#undef STACK_PUSH
+#undef MARK_ENQUEUE

     // Use tp_traverse to find everything reachable from roots.
-    if (propagate_alive_bits(&stack) < 0) {
-        gc_abort_mark_alive(interp, state, &stack);
+    if (gc_propagate_alive(&mark_args) < 0) {
+        gc_abort_mark_alive(interp, state, &mark_args);
         return -1;
     }

+    assert(mark_args.stack.head == NULL);
+
     return 0;
 }
 #endif // GC_ENABLE_MARK_ALIVE
@@ -1531,7 +1667,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state,
     if (!state->gcstate->freeze_active) {
         // Mark objects reachable from known roots as "alive". These will
         // be ignored for rest of the GC pass.
-        int err = mark_alive_from_roots(interp, state);
+        int err = gc_mark_alive_from_roots(interp, state);
         if (err < 0) {
             _PyEval_StartTheWorld(interp);
             PyErr_NoMemory();