diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index 38a1c56c09d9db..479fe10d00066d 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -50,7 +50,6 @@ static inline PyObject* _Py_FROM_GC(PyGC_Head *gc) { # define _PyGC_BITS_UNREACHABLE (4) # define _PyGC_BITS_FROZEN (8) # define _PyGC_BITS_SHARED (16) -# define _PyGC_BITS_SHARED_INLINE (32) # define _PyGC_BITS_DEFERRED (64) // Use deferred reference counting #endif @@ -119,23 +118,6 @@ static inline void _PyObject_GC_SET_SHARED(PyObject *op) { } #define _PyObject_GC_SET_SHARED(op) _PyObject_GC_SET_SHARED(_Py_CAST(PyObject*, op)) -/* True if the memory of the object is shared between multiple - * threads and needs special purpose when freeing due to - * the possibility of in-flight lock-free reads occurring. - * Objects with this bit that are GC objects will automatically - * delay-freed by PyObject_GC_Del. */ -static inline int _PyObject_GC_IS_SHARED_INLINE(PyObject *op) { - return _PyObject_HAS_GC_BITS(op, _PyGC_BITS_SHARED_INLINE); -} -#define _PyObject_GC_IS_SHARED_INLINE(op) \ - _PyObject_GC_IS_SHARED_INLINE(_Py_CAST(PyObject*, op)) - -static inline void _PyObject_GC_SET_SHARED_INLINE(PyObject *op) { - _PyObject_SET_GC_BITS(op, _PyGC_BITS_SHARED_INLINE); -} -#define _PyObject_GC_SET_SHARED_INLINE(op) \ - _PyObject_GC_SET_SHARED_INLINE(_Py_CAST(PyObject*, op)) - #endif /* Bit flags for _gc_prev */ diff --git a/Include/internal/pycore_pymem.h b/Include/internal/pycore_pymem.h index dd6b0762370c92..5bb34001aab1b4 100644 --- a/Include/internal/pycore_pymem.h +++ b/Include/internal/pycore_pymem.h @@ -120,11 +120,25 @@ extern int _PyMem_DebugEnabled(void); extern void _PyMem_FreeDelayed(void *ptr); // Enqueue an object to be freed possibly after some delay -extern void _PyObject_FreeDelayed(void *ptr); +#ifdef Py_GIL_DISABLED +extern void _PyObject_XDecRefDelayed(PyObject *obj); +#else +static inline void _PyObject_XDecRefDelayed(PyObject *obj) +{ + Py_XDECREF(obj); +} +#endif // Periodically process delayed free requests. extern void _PyMem_ProcessDelayed(PyThreadState *tstate); + +// Periodically process delayed free requests when the world is stopped. +// Notify of any objects whic should be freeed. +typedef void (*delayed_dealloc_cb)(PyObject *, void *); +extern void _PyMem_ProcessDelayedNoDealloc(PyThreadState *tstate, + delayed_dealloc_cb cb, void *state); + // Abandon all thread-local delayed free requests and push them to the // interpreter's queue. extern void _PyMem_AbandonDelayed(PyThreadState *tstate); diff --git a/Lib/test/test_free_threading/test_dict.py b/Lib/test/test_free_threading/test_dict.py index 80daf0d9cae9e0..13717cb39fa35d 100644 --- a/Lib/test/test_free_threading/test_dict.py +++ b/Lib/test/test_free_threading/test_dict.py @@ -142,6 +142,70 @@ def writer_func(l): for ref in thread_list: self.assertIsNone(ref()) + def test_racing_set_object_dict(self): + """Races assigning to __dict__ should be thread safe""" + class C: pass + class MyDict(dict): pass + for cyclic in (False, True): + f = C() + f.__dict__ = {"foo": 42} + THREAD_COUNT = 10 + + def writer_func(l): + for i in range(1000): + if cyclic: + other_d = {} + d = MyDict({"foo": 100}) + if cyclic: + d["x"] = other_d + other_d["bar"] = d + l.append(weakref.ref(d)) + f.__dict__ = d + + def reader_func(): + for i in range(1000): + f.foo + + lists = [] + readers = [] + writers = [] + for x in range(THREAD_COUNT): + thread_list = [] + lists.append(thread_list) + writer = Thread(target=partial(writer_func, thread_list)) + writers.append(writer) + + for x in range(THREAD_COUNT): + reader = Thread(target=partial(reader_func)) + readers.append(reader) + + for writer in writers: + writer.start() + for reader in readers: + reader.start() + + for writer in writers: + writer.join() + + for reader in readers: + reader.join() + + f.__dict__ = {} + gc.collect() + gc.collect() + + count = 0 + ids = set() + for thread_list in lists: + for i, ref in enumerate(thread_list): + if ref() is None: + continue + count += 1 + ids.add(id(ref())) + count += 1 + + self.assertEqual(count, 0) + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-09-25-21-50-23.gh-issue-124470.pFr3_d.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-09-25-21-50-23.gh-issue-124470.pFr3_d.rst new file mode 100644 index 00000000000000..8f2f37146d3c13 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-09-25-21-50-23.gh-issue-124470.pFr3_d.rst @@ -0,0 +1 @@ +Fix crash in free-threaded builds when replacing object dictionary while reading attribute on another thread diff --git a/Objects/dictobject.c b/Objects/dictobject.c index 23616b3918b0d6..393e9f91390809 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -7087,51 +7087,146 @@ set_dict_inline_values(PyObject *obj, PyDictObject *new_dict) } } -int -_PyObject_SetManagedDict(PyObject *obj, PyObject *new_dict) +#ifdef Py_GIL_DISABLED + +// Trys and sets the dictionary for an object in the easy case when our current +// dictionary is either completely not materialized or is a dictionary which +// does not point at the inline values. +static bool +try_set_dict_inline_only_or_other_dict(PyObject *obj, PyObject *new_dict, PyDictObject **cur_dict) +{ + bool replaced = false; + Py_BEGIN_CRITICAL_SECTION(obj); + + PyDictObject *dict = *cur_dict = _PyObject_GetManagedDict(obj); + if (dict == NULL) { + // We only have inline values, we can just completely replace them. + set_dict_inline_values(obj, (PyDictObject *)new_dict); + replaced = true; + goto exit_lock; + } + + if (FT_ATOMIC_LOAD_PTR_RELAXED(dict->ma_values) != _PyObject_InlineValues(obj)) { + // We have a materialized dict which doesn't point at the inline values, + // We get to simply swap dictionaries and free the old dictionary. + FT_ATOMIC_STORE_PTR(_PyObject_ManagedDictPointer(obj)->dict, + (PyDictObject *)Py_XNewRef(new_dict)); + replaced = true; + goto exit_lock; + } + else { + // We have inline values, we need to lock the dict and the object + // at the same time to safely dematerialize them. To do that while releasing + // the object lock we need a strong reference to the current dictionary. + Py_INCREF(dict); + } +exit_lock: + Py_END_CRITICAL_SECTION(); + return replaced; +} + +// Replaces a dictionary that is probably the dictionary which has been +// materialized and points at the inline values. We could have raced +// and replaced it with another dictionary though. +static int +replace_dict_probably_inline_materialized(PyObject *obj, PyDictObject *inline_dict, + PyDictObject *cur_dict, PyObject *new_dict) +{ + _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(obj); + + if (cur_dict == inline_dict) { + assert(FT_ATOMIC_LOAD_PTR_RELAXED(inline_dict->ma_values) == _PyObject_InlineValues(obj)); + + int err = _PyDict_DetachFromObject(inline_dict, obj); + if (err != 0) { + assert(new_dict == NULL); + return err; + } + } + + FT_ATOMIC_STORE_PTR(_PyObject_ManagedDictPointer(obj)->dict, + (PyDictObject *)Py_XNewRef(new_dict)); + return 0; +} + +#endif + +static void +decref_maybe_delay(PyObject *obj, bool delay) +{ + if (delay) { + _PyObject_XDecRefDelayed(obj); + } + else { + Py_XDECREF(obj); + } +} + +static int +set_or_clear_managed_dict(PyObject *obj, PyObject *new_dict, bool clear) { assert(Py_TYPE(obj)->tp_flags & Py_TPFLAGS_MANAGED_DICT); +#ifndef NDEBUG + Py_BEGIN_CRITICAL_SECTION(obj); assert(_PyObject_InlineValuesConsistencyCheck(obj)); + Py_END_CRITICAL_SECTION(); +#endif int err = 0; PyTypeObject *tp = Py_TYPE(obj); if (tp->tp_flags & Py_TPFLAGS_INLINE_VALUES) { - PyDictObject *dict = _PyObject_GetManagedDict(obj); - if (dict == NULL) { #ifdef Py_GIL_DISABLED - Py_BEGIN_CRITICAL_SECTION(obj); + PyDictObject *prev_dict; + if (!try_set_dict_inline_only_or_other_dict(obj, new_dict, &prev_dict)) { + // We had a materialized dictionary which pointed at the inline + // values. We need to lock both the object and the dict at the + // same time to safely replace it. We can't merely lock the dictionary + // while the object is locked because it could suspend the object lock. + PyDictObject *cur_dict; - dict = _PyObject_ManagedDictPointer(obj)->dict; - if (dict == NULL) { - set_dict_inline_values(obj, (PyDictObject *)new_dict); - } + assert(prev_dict != NULL); + Py_BEGIN_CRITICAL_SECTION2(obj, prev_dict); - Py_END_CRITICAL_SECTION(); + // We could have had another thread race in between the call to + // try_set_dict_inline_only_or_other_dict where we locked the object + // and when we unlocked and re-locked the dictionary. + cur_dict = _PyObject_GetManagedDict(obj); - if (dict == NULL) { - return 0; + err = replace_dict_probably_inline_materialized(obj, prev_dict, + cur_dict, new_dict); + + Py_END_CRITICAL_SECTION2(); + + // Decref for the dictionary we incref'd in try_set_dict_inline_only_or_other_dict + // while the object was locked + decref_maybe_delay((PyObject *)prev_dict, + !clear && prev_dict != cur_dict); + if (err != 0) { + return err; } -#else - set_dict_inline_values(obj, (PyDictObject *)new_dict); - return 0; -#endif - } - Py_BEGIN_CRITICAL_SECTION2(dict, obj); + prev_dict = cur_dict; + } - // We've locked dict, but the actual dict could have changed - // since we locked it. - dict = _PyObject_ManagedDictPointer(obj)->dict; - err = _PyDict_DetachFromObject(dict, obj); - assert(err == 0 || new_dict == NULL); - if (err == 0) { - FT_ATOMIC_STORE_PTR(_PyObject_ManagedDictPointer(obj)->dict, - (PyDictObject *)Py_XNewRef(new_dict)); + if (prev_dict != NULL) { + // decref for the dictionary that we replaced + decref_maybe_delay((PyObject *)prev_dict, !clear); } - Py_END_CRITICAL_SECTION2(); - if (err == 0) { - Py_XDECREF(dict); + return 0; +#else + PyDictObject *dict = _PyObject_GetManagedDict(obj); + if (dict == NULL) { + set_dict_inline_values(obj, (PyDictObject *)new_dict); + return 0; + } + if (_PyDict_DetachFromObject(dict, obj) == 0) { + _PyObject_ManagedDictPointer(obj)->dict = (PyDictObject *)Py_XNewRef(new_dict); + Py_DECREF(dict); + return 0; } + assert(new_dict == NULL); + return -1; +#endif } else { PyDictObject *dict; @@ -7144,17 +7239,22 @@ _PyObject_SetManagedDict(PyObject *obj, PyObject *new_dict) (PyDictObject *)Py_XNewRef(new_dict)); Py_END_CRITICAL_SECTION(); - - Py_XDECREF(dict); + decref_maybe_delay((PyObject *)dict, !clear); } assert(_PyObject_InlineValuesConsistencyCheck(obj)); return err; } +int +_PyObject_SetManagedDict(PyObject *obj, PyObject *new_dict) +{ + return set_or_clear_managed_dict(obj, new_dict, false); +} + void PyObject_ClearManagedDict(PyObject *obj) { - if (_PyObject_SetManagedDict(obj, NULL) < 0) { + if (set_or_clear_managed_dict(obj, NULL, true) < 0) { /* Must be out of memory */ assert(PyErr_Occurred() == PyExc_MemoryError); PyErr_WriteUnraisable(NULL); diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c index dfeccfa4dd7658..3d6b1ab1840e73 100644 --- a/Objects/obmalloc.c +++ b/Objects/obmalloc.c @@ -1093,10 +1093,24 @@ struct _mem_work_chunk { }; static void -free_work_item(uintptr_t ptr) +free_work_item(uintptr_t ptr, delayed_dealloc_cb cb, void *state) { if (ptr & 0x01) { - PyObject_Free((char *)(ptr - 1)); + PyObject *obj = (PyObject *)(ptr - 1); +#ifdef Py_GIL_DISABLED + if (cb == NULL) { + assert(!_PyInterpreterState_GET()->stoptheworld.world_stopped); + Py_DECREF(obj); + return; + } + + Py_ssize_t refcount = _Py_ExplicitMergeRefcount(obj, -1); + if (refcount == 0) { + cb(obj, state); + } +#else + Py_DECREF(obj); +#endif } else { PyMem_Free((void *)ptr); @@ -1107,7 +1121,7 @@ static void free_delayed(uintptr_t ptr) { #ifndef Py_GIL_DISABLED - free_work_item(ptr); + free_work_item(ptr, NULL, NULL); #else PyInterpreterState *interp = _PyInterpreterState_GET(); if (_PyInterpreterState_GetFinalizing(interp) != NULL || @@ -1115,7 +1129,8 @@ free_delayed(uintptr_t ptr) { // Free immediately during interpreter shutdown or if the world is // stopped. - free_work_item(ptr); + assert(!interp->stoptheworld.world_stopped || !(ptr & 0x01)); + free_work_item(ptr, NULL, NULL); return; } @@ -1142,7 +1157,8 @@ free_delayed(uintptr_t ptr) if (buf == NULL) { // failed to allocate a buffer, free immediately _PyEval_StopTheWorld(tstate->base.interp); - free_work_item(ptr); + // TODO: Fix me + free_work_item(ptr, NULL, NULL); _PyEval_StartTheWorld(tstate->base.interp); return; } @@ -1166,12 +1182,16 @@ _PyMem_FreeDelayed(void *ptr) free_delayed((uintptr_t)ptr); } +#ifdef Py_GIL_DISABLED void -_PyObject_FreeDelayed(void *ptr) +_PyObject_XDecRefDelayed(PyObject *ptr) { assert(!((uintptr_t)ptr & 0x01)); - free_delayed(((uintptr_t)ptr)|0x01); + if (ptr != NULL) { + free_delayed(((uintptr_t)ptr)|0x01); + } } +#endif static struct _mem_work_chunk * work_queue_first(struct llist_node *head) @@ -1181,7 +1201,7 @@ work_queue_first(struct llist_node *head) static void process_queue(struct llist_node *head, struct _qsbr_thread_state *qsbr, - bool keep_empty) + bool keep_empty, delayed_dealloc_cb cb, void *state) { while (!llist_empty(head)) { struct _mem_work_chunk *buf = work_queue_first(head); @@ -1192,7 +1212,7 @@ process_queue(struct llist_node *head, struct _qsbr_thread_state *qsbr, return; } - free_work_item(item->ptr); + free_work_item(item->ptr, cb, state); buf->rd_idx++; } @@ -1210,7 +1230,8 @@ process_queue(struct llist_node *head, struct _qsbr_thread_state *qsbr, static void process_interp_queue(struct _Py_mem_interp_free_queue *queue, - struct _qsbr_thread_state *qsbr) + struct _qsbr_thread_state *qsbr, delayed_dealloc_cb cb, + void *state) { if (!_Py_atomic_load_int_relaxed(&queue->has_work)) { return; @@ -1218,7 +1239,7 @@ process_interp_queue(struct _Py_mem_interp_free_queue *queue, // Try to acquire the lock, but don't block if it's already held. if (_PyMutex_LockTimed(&queue->mutex, 0, 0) == PY_LOCK_ACQUIRED) { - process_queue(&queue->head, qsbr, false); + process_queue(&queue->head, qsbr, false, cb, state); int more_work = !llist_empty(&queue->head); _Py_atomic_store_int_relaxed(&queue->has_work, more_work); @@ -1234,10 +1255,23 @@ _PyMem_ProcessDelayed(PyThreadState *tstate) _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate; // Process thread-local work - process_queue(&tstate_impl->mem_free_queue, tstate_impl->qsbr, true); + process_queue(&tstate_impl->mem_free_queue, tstate_impl->qsbr, true, NULL, NULL); + + // Process shared interpreter work + process_interp_queue(&interp->mem_free_queue, tstate_impl->qsbr, NULL, NULL); +} + +void +_PyMem_ProcessDelayedNoDealloc(PyThreadState *tstate, delayed_dealloc_cb cb, void *state) +{ + PyInterpreterState *interp = tstate->interp; + _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate; + + // Process thread-local work + process_queue(&tstate_impl->mem_free_queue, tstate_impl->qsbr, true, cb, state); // Process shared interpreter work - process_interp_queue(&interp->mem_free_queue, tstate_impl->qsbr); + process_interp_queue(&interp->mem_free_queue, tstate_impl->qsbr, cb, state); } void @@ -1279,7 +1313,7 @@ _PyMem_FiniDelayed(PyInterpreterState *interp) // Free the remaining items immediately. There should be no other // threads accessing the memory at this point during shutdown. struct _mem_work_item *item = &buf->array[buf->rd_idx]; - free_work_item(item->ptr); + free_work_item(item->ptr, NULL, NULL); buf->rd_idx++; } diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index a6e0022340b7fe..0920c616c3c6b0 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -393,6 +393,23 @@ gc_visit_thread_stacks(PyInterpreterState *interp) HEAD_UNLOCK(&_PyRuntime); } +static void +queue_untracked_obj_decref(PyObject *op, struct collection_state *state) +{ + if (!_PyObject_GC_IS_TRACKED(op)) { + // GC objects with zero refcount are handled subsequently by the + // GC as if they were cyclic trash, but we have to handle dead + // non-GC objects here. Add one to the refcount so that we can + // decref and deallocate the object once we start the world again. + op->ob_ref_shared += (1 << _Py_REF_SHARED_SHIFT); +#ifdef Py_REF_DEBUG + _Py_IncRefTotal(_PyThreadState_GET()); +#endif + worklist_push(&state->objs_to_decref, op); + } + +} + static void merge_queued_objects(_PyThreadStateImpl *tstate, struct collection_state *state) { @@ -404,22 +421,20 @@ merge_queued_objects(_PyThreadStateImpl *tstate, struct collection_state *state) // Subtract one when merging because the queue had a reference. Py_ssize_t refcount = merge_refcount(op, -1); - if (!_PyObject_GC_IS_TRACKED(op) && refcount == 0) { - // GC objects with zero refcount are handled subsequently by the - // GC as if they were cyclic trash, but we have to handle dead - // non-GC objects here. Add one to the refcount so that we can - // decref and deallocate the object once we start the world again. - op->ob_ref_shared += (1 << _Py_REF_SHARED_SHIFT); -#ifdef Py_REF_DEBUG - _Py_IncRefTotal(_PyThreadState_GET()); -#endif - worklist_push(&state->objs_to_decref, op); + if (refcount == 0) { + queue_untracked_obj_decref(op, state); } } } static void -process_delayed_frees(PyInterpreterState *interp) +queue_freed_object(PyObject *obj, void *arg) +{ + queue_untracked_obj_decref(obj, arg); +} + +static void +process_delayed_frees(PyInterpreterState *interp, struct collection_state *state) { // While we are in a "stop the world" pause, we can observe the latest // write sequence by advancing the write sequence immediately. @@ -438,7 +453,7 @@ process_delayed_frees(PyInterpreterState *interp) } HEAD_UNLOCK(&_PyRuntime); - _PyMem_ProcessDelayed((PyThreadState *)current_tstate); + _PyMem_ProcessDelayedNoDealloc((PyThreadState *)current_tstate, queue_freed_object, state); } // Subtract an incoming reference from the computed "gc_refs" refcount. @@ -1231,7 +1246,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, } HEAD_UNLOCK(&_PyRuntime); - process_delayed_frees(interp); + process_delayed_frees(interp, state); // Find unreachable objects int err = deduce_unreachable_heap(interp, state); @@ -1910,13 +1925,7 @@ PyObject_GC_Del(void *op) } record_deallocation(_PyThreadState_GET()); - PyObject *self = (PyObject *)op; - if (_PyObject_GC_IS_SHARED_INLINE(self)) { - _PyObject_FreeDelayed(((char *)op)-presize); - } - else { - PyObject_Free(((char *)op)-presize); - } + PyObject_Free(((char *)op)-presize); } int