@@ -123,6 +123,33 @@ _PyMem_mi_page_is_safe_to_free(mi_page_t *page)
 
 }
 
+#ifdef Py_GIL_DISABLED
+
+// If we are deferring collection of more than this amount of memory for
+// mimalloc pages, advance the write sequence. Advancing allows these
+// pages to be re-used in a different thread or for a different size class.
+#define QSBR_PAGE_MEM_LIMIT 4096*20
+
+// Return true if the global write sequence should be advanced for a mimalloc
+// page that is deferred from collection.
+static bool
+should_advance_qsbr_for_page(struct _qsbr_thread_state *qsbr, mi_page_t *page)
+{
+    size_t bsize = mi_page_block_size(page);
+    size_t page_size = page->capacity * bsize;
+    if (page_size > QSBR_PAGE_MEM_LIMIT) {
+        qsbr->deferred_page_memory = 0;
+        return true;
+    }
+    qsbr->deferred_page_memory += page_size;
+    if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) {
+        qsbr->deferred_page_memory = 0;
+        return true;
+    }
+    return false;
+}
+#endif
+
 static bool
 _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)
 {
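The accounting above advances the sequence either for one oversized page or once enough small pages accumulate; with QSBR_PAGE_MEM_LIMIT at 4096*20 the budget is 80 KiB. A minimal sketch of the same arithmetic, rebuilt outside CPython (the struct and helper names here are hypothetical stand-ins, not the patched code itself):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define QSBR_PAGE_MEM_LIMIT 4096*20

struct page_qsbr_sketch {
    size_t deferred_page_memory;
};

static bool
advance_for_page_size(struct page_qsbr_sketch *qsbr, size_t page_size)
{
    if (page_size > QSBR_PAGE_MEM_LIMIT) {
        // One oversized page advances immediately.
        qsbr->deferred_page_memory = 0;
        return true;
    }
    qsbr->deferred_page_memory += page_size;
    if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) {
        qsbr->deferred_page_memory = 0;
        return true;
    }
    return false;
}

int main(void)
{
    struct page_qsbr_sketch q = {0};
    // Twenty 4 KiB pages fit the 80 KiB budget exactly; the 21st page
    // pushes the accumulator over the limit and triggers an advance.
    for (int i = 1; i <= 21; i++) {
        if (advance_for_page_size(&q, 4096)) {
            printf("advance after page %d\n", i);  // prints 21
        }
    }
    return 0;
}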
@@ -138,7 +165,14 @@ _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)
 
         _PyMem_mi_page_clear_qsbr(page);
         page->retire_expire = 0;
-        page->qsbr_goal = _Py_qsbr_deferred_advance(tstate->qsbr);
+
+        if (should_advance_qsbr_for_page(tstate->qsbr, page)) {
+            page->qsbr_goal = _Py_qsbr_advance(tstate->qsbr->shared);
+        }
+        else {
+            page->qsbr_goal = _Py_qsbr_shared_next(tstate->qsbr->shared);
+        }
+
         llist_insert_tail(&tstate->mimalloc.page_list, &page->qsbr_node);
         return false;
     }
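The two branches differ only in whether the global write sequence actually moves. A conceptual sketch of the distinction, assuming sequence numbers advance in steps of QSBR_INCR = 2 as in CPython's QSBR header; the type and helpers here are illustrative, not the real implementations:

#include <stdint.h>

#define QSBR_INCR 2  // assumed step size, per pycore_qsbr.h

struct qsbr_shared_sketch {
    uint64_t wr_seq;  // global write sequence (atomic in the real code)
};

// "Advance": bump the sequence now and return the new value. Goals taken
// from this value become reachable as soon as all threads quiesce.
static uint64_t
advance_sketch(struct qsbr_shared_sketch *s)
{
    s->wr_seq += QSBR_INCR;
    return s->wr_seq;
}

// "Next": compute the value a future advance would publish, without
// publishing it. A goal taken from this value stays unreachable until
// someone later advances the sequence.
static uint64_t
next_sketch(struct qsbr_shared_sketch *s)
{
    return s->wr_seq + QSBR_INCR;
}

So a page parked with the cheaper `_Py_qsbr_shared_next` goal cannot be reclaimed until a later caller advances the sequence, which the memory-limit check above guarantees happens once enough page memory is deferred.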
@@ -1103,8 +1137,44 @@ free_work_item(uintptr_t ptr)
     }
 }
 
+
+#ifdef Py_GIL_DISABLED
+
+// For deferred advance on free: the number of deferred items before advancing
+// the write sequence. This is based on WORK_ITEMS_PER_CHUNK. We ideally
+// want to process a chunk before it overflows.
+#define QSBR_DEFERRED_LIMIT 127
+
+// If the deferred memory exceeds 1 MiB, advance the write sequence. This
+// helps limit memory usage due to QSBR delaying frees too long.
+#define QSBR_FREE_MEM_LIMIT 1024*1024
+
+// Return true if the global write sequence should be advanced for a deferred
+// memory free.
+static bool
+should_advance_qsbr_for_free(struct _qsbr_thread_state *qsbr, size_t size)
+{
+    if (size > QSBR_FREE_MEM_LIMIT) {
+        qsbr->deferred_count = 0;
+        qsbr->deferred_memory = 0;
+        qsbr->should_process = true;
+        return true;
+    }
+    qsbr->deferred_count++;
+    qsbr->deferred_memory += size;
+    if (qsbr->deferred_count > QSBR_DEFERRED_LIMIT ||
+        qsbr->deferred_memory > QSBR_FREE_MEM_LIMIT) {
+        qsbr->deferred_count = 0;
+        qsbr->deferred_memory = 0;
+        qsbr->should_process = true;
+        return true;
+    }
+    return false;
+}
+#endif
+
 static void
-free_delayed(uintptr_t ptr)
+free_delayed(uintptr_t ptr, size_t size)
 {
 #ifndef Py_GIL_DISABLED
     free_work_item(ptr);
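Taken together, the checks advance on the 128th deferred item, or as soon as deferred bytes cross 1 MiB, or immediately for a single free larger than 1 MiB. A standalone sketch exercising the count trigger and the size trigger (the struct and helper mirror the logic above but are hypothetical test scaffolding, not CPython code):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define QSBR_DEFERRED_LIMIT 127
#define QSBR_FREE_MEM_LIMIT 1024*1024

struct free_qsbr_sketch {
    size_t deferred_count;
    size_t deferred_memory;
    bool should_process;
};

static bool
advance_for_free(struct free_qsbr_sketch *q, size_t size)
{
    if (size > QSBR_FREE_MEM_LIMIT) {
        q->deferred_count = 0;
        q->deferred_memory = 0;
        q->should_process = true;
        return true;
    }
    q->deferred_count++;
    q->deferred_memory += size;
    if (q->deferred_count > QSBR_DEFERRED_LIMIT ||
            q->deferred_memory > QSBR_FREE_MEM_LIMIT) {
        q->deferred_count = 0;
        q->deferred_memory = 0;
        q->should_process = true;
        return true;
    }
    return false;
}

int main(void)
{
    struct free_qsbr_sketch q = {0};

    // Count trigger: zero-sized frees (as _PyObject_FreeDelayed issues)
    // advance once deferred_count exceeds 127, i.e. on the 128th call.
    int calls = 0;
    while (!advance_for_free(&q, 0)) {
        calls++;
    }
    printf("advanced after %d small frees\n", calls + 1);  // 128

    // Size trigger: a single 2 MiB free advances immediately.
    printf("big free advances: %d\n", advance_for_free(&q, (size_t)2 << 20));
    return 0;
}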
@@ -1145,31 +1215,43 @@ free_delayed(uintptr_t ptr)
     }
 
     assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
-    uint64_t seq = _Py_qsbr_deferred_advance(tstate->qsbr);
+    uint64_t seq;
+    if (should_advance_qsbr_for_free(tstate->qsbr, size)) {
+        seq = _Py_qsbr_advance(tstate->qsbr->shared);
+    }
+    else {
+        seq = _Py_qsbr_shared_next(tstate->qsbr->shared);
+    }
     buf->array[buf->wr_idx].ptr = ptr;
     buf->array[buf->wr_idx].qsbr_goal = seq;
     buf->wr_idx++;
 
     if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
+        // Normally the processing of delayed items is done from the eval
+        // breaker. Processing here is a safety measure to ensure too much
+        // work does not accumulate.
         _PyMem_ProcessDelayed((PyThreadState *)tstate);
     }
 #endif
 }
 
 void
-_PyMem_FreeDelayed(void *ptr)
+_PyMem_FreeDelayed(void *ptr, size_t size)
 {
     assert(!((uintptr_t)ptr & 0x01));
     if (ptr != NULL) {
-        free_delayed((uintptr_t)ptr);
+        free_delayed((uintptr_t)ptr, size);
     }
 }
 
 void
 _PyObject_FreeDelayed(void *ptr)
 {
     assert(!((uintptr_t)ptr & 0x01));
-    free_delayed(((uintptr_t)ptr)|0x01);
+    // We use 0 as the size since we don't have an easy way to know the
+    // actual size. If we are freeing many objects, the write sequence
+    // will be advanced due to QSBR_DEFERRED_LIMIT.
+    free_delayed(((uintptr_t)ptr)|0x01, 0);
 }
 
 static struct _mem_work_chunk *
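With the extra parameter, callers of `_PyMem_FreeDelayed` now pass the allocation size so QSBR can account for deferred bytes. A hedged sketch of what a call site might look like after this change; `table_sketch` and `table_resize_sketch` are hypothetical, not actual CPython callers:

typedef struct {
    void *entries;
    size_t entries_size;  // byte size remembered at allocation time
} table_sketch;

static void
table_resize_sketch(table_sketch *t, void *new_entries, size_t new_size)
{
    void *old = t->entries;
    size_t old_size = t->entries_size;
    t->entries = new_entries;
    t->entries_size = new_size;
    // Concurrent readers may still be scanning the old array; defer the
    // free, and let QSBR count old_size toward its 1 MiB budget.
    _PyMem_FreeDelayed(old, old_size);
}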
@@ -1239,6 +1321,8 @@ _PyMem_ProcessDelayed(PyThreadState *tstate)
     PyInterpreterState *interp = tstate->interp;
     _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;
 
+    tstate_impl->qsbr->should_process = false;
+
     // Process thread-local work
     process_queue(&tstate_impl->mem_free_queue, tstate_impl->qsbr, true);
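Clearing `should_process` here pairs with the flag being set in `should_advance_qsbr_for_free`. The comment in the earlier hunk says delayed items are normally processed from the eval breaker; a sketch of how such a periodic check might consume the flag (the helper name is hypothetical, and the call path is an assumption, not code from this commit):

static void
maybe_process_delayed_sketch(PyThreadState *tstate)
{
    _PyThreadStateImpl *impl = (_PyThreadStateImpl *)tstate;
    if (impl->qsbr->should_process) {
        // _PyMem_ProcessDelayed resets should_process before draining the
        // queues, so a request arriving mid-drain is not silently lost.
        _PyMem_ProcessDelayed(tstate);
    }
}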