From a3da6027c6551d85ec758caa6cf23b970fe7b0fb Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Sat, 18 Apr 2026 01:13:15 -0700
Subject: [PATCH 1/5] GH-148937: fix for free-threaded GC (RSS based defer)

Asking the OS for the process memory usage doesn't work well given how
mimalloc works. It does not promptly return memory to the OS, so process
memory usage doesn't drop after cyclic trash is freed.

Instead of asking the OS, use mimalloc APIs to compute how much memory is
being used by all mimalloc arenas. We need to stop the world to do this,
but we can usually avoid doing a collection, so from a performance
perspective it is worth it.
---
 Include/internal/pycore_interp_structs.h |  11 +-
 Python/gc_free_threading.c               | 271 +++++++----------------
 2 files changed, 82 insertions(+), 200 deletions(-)

diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h
index fb810c82a5aa63..f97ec647b37c81 100644
--- a/Include/internal/pycore_interp_structs.h
+++ b/Include/internal/pycore_interp_structs.h
@@ -281,15 +281,14 @@ struct _gc_runtime_state {
     /* True if gc.freeze() has been used. */
     int freeze_active;
 
-    /* Memory usage of the process (RSS + swap) after last GC. */
-    Py_ssize_t last_mem;
+    /* Sum of area->used*area->block_size across all mimalloc heaps after last
+       GC, in KB. Updated under stop-the-world so the measurement is accurate
+       even when OS pages are being reused. */
+    Py_ssize_t last_gc_used;
 
     /* This accumulates the new object count whenever collection is deferred
-       due to the RSS increase condition not being meet. Reset on collection. */
+       due to memory usage not increasing enough. Reset on collection. */
     Py_ssize_t deferred_count;
-
-    /* Mutex held for gc_should_collect_mem_usage(). */
-    PyMutex mutex;
 #endif
 };
 
diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c
index 4b46ca04f56b20..25de084b1203ae 100644
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@@ -17,30 +17,7 @@
 
 #include "pydtrace.h"
 
-// Platform-specific includes for get_process_mem_usage().
-#ifdef _WIN32
-    #include <windows.h>
-    #include <psapi.h>       // For GetProcessMemoryInfo
-#elif defined(__linux__)
-    #include <unistd.h>      // For sysconf, getpid
-#elif defined(__APPLE__)
-    #include <mach/mach.h>
-    #include <mach/task.h>   // Required for TASK_VM_INFO
-    #include <unistd.h>      // For sysconf, getpid
-#elif defined(__FreeBSD__)
-    #include <sys/types.h>
-    #include <sys/sysctl.h>
-    #include <sys/user.h>    // Requires sys/user.h for kinfo_proc definition
-    #include <kvm.h>
-    #include <unistd.h>      // For sysconf, getpid
-    #include <fcntl.h>       // For O_RDONLY
-    #include <limits.h>      // For _POSIX2_LINE_MAX
-#elif defined(__OpenBSD__)
-    #include <sys/types.h>
-    #include <sys/sysctl.h>
-    #include <sys/user.h>    // For kinfo_proc
-    #include <unistd.h>      // For sysconf, getpid
-#endif
+#include "pycore_mimalloc.h"  // mi_heap_visit_blocks()
 
 // enable the "mark alive" pass of GC
 #define GC_ENABLE_MARK_ALIVE 1
@@ -2016,188 +1993,93 @@ cleanup_worklist(struct worklist *worklist)
     }
 }
 
-// Return the memory usage (typically RSS + swap) of the process, in units of
-// KB. Returns -1 if this operation is not supported or on failure.
-static Py_ssize_t
-get_process_mem_usage(void)
-{
-#ifdef _WIN32
-    // Windows implementation using GetProcessMemoryInfo
-    // Returns WorkingSetSize + PagefileUsage
-    PROCESS_MEMORY_COUNTERS pmc;
-    HANDLE hProcess = GetCurrentProcess();
-    if (NULL == hProcess) {
-        // Should not happen for the current process
-        return -1;
-    }
-
-    // GetProcessMemoryInfo returns non-zero on success
-    if (GetProcessMemoryInfo(hProcess, &pmc, sizeof(pmc))) {
-        // Values are in bytes, convert to KB.
-        return (Py_ssize_t)((pmc.WorkingSetSize + pmc.PagefileUsage) / 1024);
-    }
-    else {
-        return -1;
-    }
+// Visitor for get_all_mimalloc_used_kb(): called once per heap area.
+struct count_used_area_args {
+    Py_ssize_t total_bytes;
+};
 
-#elif __linux__
-    FILE* fp = fopen("/proc/self/status", "r");
-    if (fp == NULL) {
-        return -1;
+static bool
+count_used_area_visitor(const mi_heap_t *heap, const mi_heap_area_t *area,
+                        void *block, size_t block_size, void *arg)
+{
+    if (block == NULL) {
+        // Called once per area when visit_all_blocks=false.
+        ((struct count_used_area_args *)arg)->total_bytes +=
+            (Py_ssize_t)(area->used * area->block_size);
     }
+    return true;
+}
 
-    char line_buffer[256];
-    long long rss_kb = -1;
-    long long swap_kb = -1;
-
-    while (fgets(line_buffer, sizeof(line_buffer), fp) != NULL) {
-        if (rss_kb == -1 && strncmp(line_buffer, "VmRSS:", 6) == 0) {
-            sscanf(line_buffer + 6, "%lld", &rss_kb);
-        }
-        else if (swap_kb == -1 && strncmp(line_buffer, "VmSwap:", 7) == 0) {
-            sscanf(line_buffer + 7, "%lld", &swap_kb);
+// Return the total memory in use across all mimalloc heaps for all threads,
+// in KB. Requires the world to be stopped so heap structures are stable.
+static Py_ssize_t
+get_all_mimalloc_used_kb(PyInterpreterState *interp)
+{
+    assert(interp->stoptheworld.world_stopped);
+    struct count_used_area_args args = {0};
+    HEAD_LOCK(&_PyRuntime);
+    _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) {
+        struct _mimalloc_thread_state *m = &((_PyThreadStateImpl *)p)->mimalloc;
+        if (!_Py_atomic_load_int(&m->initialized)) {
+            continue;
         }
-        if (rss_kb != -1 && swap_kb != -1) {
-            break; // Found both
+        for (int h = 0; h < _Py_MIMALLOC_HEAP_COUNT; h++) {
+            mi_heap_visit_blocks(&m->heaps[h], false,
+                                 count_used_area_visitor, &args);
         }
     }
-    fclose(fp);
-
-    if (rss_kb != -1 && swap_kb != -1) {
-        return (Py_ssize_t)(rss_kb + swap_kb);
-    }
-    return -1;
-
-#elif defined(__APPLE__)
-    // --- MacOS (Darwin) ---
-    // Returns phys_footprint (RAM + compressed memory)
-    task_vm_info_data_t vm_info;
-    mach_msg_type_number_t count = TASK_VM_INFO_COUNT;
-    kern_return_t kerr;
-
-    kerr = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count);
-    if (kerr != KERN_SUCCESS) {
-        return -1;
-    }
-    // phys_footprint is in bytes. Convert to KB.
-    return (Py_ssize_t)(vm_info.phys_footprint / 1024);
-
-#elif defined(__FreeBSD__)
-    // NOTE: Returns RSS only. Per-process swap usage isn't readily available
-    long page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
-    if (page_size_kb <= 0) {
-        return -1;
-    }
-
-    // Using /dev/null for vmcore avoids needing dump file.
-    // NULL for kernel file uses running kernel.
-    char errbuf[_POSIX2_LINE_MAX]; // For kvm error messages
-    kvm_t *kd = kvm_openfiles(NULL, "/dev/null", NULL, O_RDONLY, errbuf);
-    if (kd == NULL) {
-        return -1;
-    }
-
-    // KERN_PROC_PID filters for the specific process ID
-    // n_procs will contain the number of processes returned (should be 1 or 0)
-    pid_t pid = getpid();
-    int n_procs;
-    struct kinfo_proc *kp = kvm_getprocs(kd, KERN_PROC_PID, pid, &n_procs);
-    if (kp == NULL) {
-        kvm_close(kd);
-        return -1;
-    }
-
-    Py_ssize_t rss_kb = -1;
-    if (n_procs > 0) {
-        // kp[0] contains the info for our process
-        // ki_rssize is in pages. Convert to KB.
-        rss_kb = (Py_ssize_t)kp->ki_rssize * page_size_kb;
-    }
-    else {
-        // Process with PID not found, shouldn't happen for self.
-        rss_kb = -1;
-    }
-
-    kvm_close(kd);
-    return rss_kb;
-
-#elif defined(__OpenBSD__)
-    // NOTE: Returns RSS only. Per-process swap usage isn't readily available
-    long page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
-    if (page_size_kb <= 0) {
-        return -1;
-    }
-
-    struct kinfo_proc kp;
-    pid_t pid = getpid();
-    int mib[6];
-    size_t len = sizeof(kp);
-
-    mib[0] = CTL_KERN;
-    mib[1] = KERN_PROC;
-    mib[2] = KERN_PROC_PID;
-    mib[3] = pid;
-    mib[4] = sizeof(struct kinfo_proc); // size of the structure we want
-    mib[5] = 1; // want 1 structure back
-    if (sysctl(mib, 6, &kp, &len, NULL, 0) == -1) {
-        return -1;
-    }
-
-    if (len > 0) {
-        // p_vm_rssize is in pages on OpenBSD. Convert to KB.
-        return (Py_ssize_t)kp.p_vm_rssize * page_size_kb;
-    }
-    else {
-        // Process info not returned
-        return -1;
-    }
-#else
-    // Unsupported platform
-    return -1;
-#endif
+    mi_abandoned_pool_t *pool = &interp->mimalloc.abandoned_pool;
+    // Only GC page tags are supported by _mi_abandoned_pool_visit_blocks.
+    _mi_abandoned_pool_visit_blocks(pool, _Py_MIMALLOC_HEAP_GC, false,
+                                    count_used_area_visitor, &args);
+    _mi_abandoned_pool_visit_blocks(pool, _Py_MIMALLOC_HEAP_GC_PRE, false,
+                                    count_used_area_visitor, &args);
+    HEAD_UNLOCK(&_PyRuntime);
+    return args.total_bytes / 1024;
 }
 
+// Decide whether memory usage has grown enough to warrant a collection.
+// Stops the world to measure mimalloc heap usage accurately; OS-level RSS
+// is unreliable since mimalloc reuses pages without returning them.
 static bool
-gc_should_collect_mem_usage(GCState *gcstate)
+gc_should_collect_mem_usage(PyThreadState *tstate)
 {
-    Py_ssize_t mem = get_process_mem_usage();
-    if (mem < 0) {
-        // Reading process memory usage is not support or failed.
-        return true;
-    }
+    PyInterpreterState *interp = tstate->interp;
+    GCState *gcstate = &interp->gc;
     int threshold = gcstate->young.threshold;
-    Py_ssize_t deferred = _Py_atomic_load_ssize_relaxed(&gcstate->deferred_count);
-    if (deferred > threshold * 40) {
-        // Too many new container objects since last GC, even though memory use
-        // might not have increased much. This is intended to avoid resource
-        // exhaustion if some objects consume resources but don't result in a
-        // memory usage increase. We use 40x as the factor here because older
-        // versions of Python would do full collections after roughly every
-        // 70,000 new container objects.
+
+    if (gcstate->deferred_count > threshold * 40) {
+        // Too many new container objects since last GC, even though memory
+        // use might not have increased much. This avoids resource
+        // exhaustion if some objects consume resources but don't result in
+        // a memory usage increase. We use 40x here because older versions
+        // of Python would do full collections after roughly every 70,000
+        // new container objects.
         return true;
     }
 
-    Py_ssize_t last_mem = _Py_atomic_load_ssize_relaxed(&gcstate->last_mem);
-    Py_ssize_t mem_threshold = Py_MAX(last_mem / 10, 128);
-    if ((mem - last_mem) > mem_threshold) {
-        // The process memory usage has increased too much, do a collection.
+    _PyEval_StopTheWorld(interp);
+    Py_ssize_t used = get_all_mimalloc_used_kb(interp);
+    Py_ssize_t last = gcstate->last_gc_used;
+    Py_ssize_t mem_threshold = Py_MAX(last / 10, 128);
+    if ((used - last) > mem_threshold) {
+        // Heap usage has grown enough, collect.
+        _PyEval_StartTheWorld(interp);
        return true;
    }
-    else {
-        // The memory usage has not increased enough, defer the collection and
-        // clear the young object count so we don't check memory usage again
-        // on the next call to gc_should_collect().
-        PyMutex_Lock(&gcstate->mutex);
-        int young_count = _Py_atomic_exchange_int(&gcstate->young.count, 0);
-        _Py_atomic_store_ssize_relaxed(&gcstate->deferred_count,
-                                       gcstate->deferred_count + young_count);
-        PyMutex_Unlock(&gcstate->mutex);
-        return false;
-    }
+    // Memory usage has not grown enough. Defer the collection, rolling the
+    // young count into deferred_count so we don't keep checking on every
+    // call to gc_should_collect().
+    int young_count = gcstate->young.count;
+    gcstate->young.count = 0;
+    gcstate->deferred_count += young_count;
+    _PyEval_StartTheWorld(interp);
+    return false;
 }
 
 static bool
-gc_should_collect(GCState *gcstate)
+gc_should_collect(PyThreadState *tstate)
 {
+    GCState *gcstate = &tstate->interp->gc;
     int count = _Py_atomic_load_int_relaxed(&gcstate->young.count);
     int threshold = gcstate->young.threshold;
     int gc_enabled = _Py_atomic_load_int_relaxed(&gcstate->enabled);
@@ -2214,7 +2096,7 @@ gc_should_collect(GCState *gcstate)
         // objects.
         return false;
     }
-    return gc_should_collect_mem_usage(gcstate);
+    return gc_should_collect_mem_usage(tstate);
 }
 
 static void
@@ -2231,7 +2113,7 @@ record_allocation(PyThreadState *tstate)
         _Py_atomic_add_int(&gcstate->young.count, (int)gc->alloc_count);
         gc->alloc_count = 0;
 
-        if (gc_should_collect(gcstate) &&
+        if (gc_should_collect(tstate) &&
             !_Py_atomic_load_int_relaxed(&gcstate->collecting))
         {
             _Py_ScheduleGC(tstate);
@@ -2379,10 +2261,11 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state,
     // to be freed.
     delete_garbage(state);
 
-    // Store the current memory usage, can be smaller now if breaking cycles
-    // freed some memory.
-    Py_ssize_t last_mem = get_process_mem_usage();
-    _Py_atomic_store_ssize_relaxed(&state->gcstate->last_mem, last_mem);
+    // Record mimalloc heap usage as the baseline for the next collection's
+    // growth check. Stop-the-world so the heap structures are stable.
+    _PyEval_StopTheWorld(interp);
+    state->gcstate->last_gc_used = get_all_mimalloc_used_kb(interp);
+    _PyEval_StartTheWorld(interp);
 
     // Append objects with legacy finalizers to the "gc.garbage" list.
     handle_legacy_finalizers(state);
@@ -2423,7 +2306,7 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason)
         return 0;
     }
 
-    if (reason == _Py_GC_REASON_HEAP && !gc_should_collect(gcstate)) {
+    if (reason == _Py_GC_REASON_HEAP && !gc_should_collect(tstate)) {
         // Don't collect if the threshold is not exceeded.
         _Py_atomic_store_int(&gcstate->collecting, 0);
         return 0;

From 946b042865f7b1c56adca2303196f24c277a4da2 Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Thu, 23 Apr 2026 18:35:10 -0700
Subject: [PATCH 2/5] Move gc_should_collect_mem_usage() call.

It's probably better to call this inside gc_collect_main(). That way,
we are not doing the stop-the-world from inside the _PyObject_GC_Link()
function. This should have no significant performance impact since we
hit this path only after the young object count hits the threshold.
---
 Python/gc_free_threading.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c
index 25de084b1203ae..8f7ecab64e33ac 100644
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@@ -2048,6 +2048,11 @@ gc_should_collect_mem_usage(PyThreadState *tstate)
     GCState *gcstate = &interp->gc;
     int threshold = gcstate->young.threshold;
 
+    if (gcstate->old[0].threshold == 0) {
+        // A few tests rely on immediate scheduling of the GC so we ignore the
+        // extra conditions if generations[1].threshold is set to zero.
+        return true;
+    }
     if (gcstate->deferred_count > threshold * 40) {
         // Too many new container objects since last GC, even though memory
@@ -2096,7 +2101,7 @@ gc_should_collect(PyThreadState *tstate)
         // objects.
         return false;
     }
-    return gc_should_collect_mem_usage(tstate);
+    return true;
 }
 
 static void
@@ -2311,6 +2316,10 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason)
         _Py_atomic_store_int(&gcstate->collecting, 0);
         return 0;
     }
+    if (reason == _Py_GC_REASON_HEAP && !gc_should_collect_mem_usage(tstate)) {
+        _Py_atomic_store_int(&gcstate->collecting, 0);
+        return 0;
+    }
 
     gcstate->frame = tstate->current_frame;
     assert(generation >= 0 && generation < NUM_GENERATIONS);

From b77864c8ff78871f563e65bf6fc09e93b5942c71 Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Thu, 23 Apr 2026 18:39:45 -0700
Subject: [PATCH 3/5] Add blurb.

---
 .../2026-04-23-18-39-41.gh-issue-148937.yp--1l.rst          | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-04-23-18-39-41.gh-issue-148937.yp--1l.rst

diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-04-23-18-39-41.gh-issue-148937.yp--1l.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-23-18-39-41.gh-issue-148937.yp--1l.rst
new file mode 100644
index 00000000000000..523792372bc8e5
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-23-18-39-41.gh-issue-148937.yp--1l.rst
@@ -0,0 +1,3 @@
+Fix a bug in the free-threaded GC that caused collections to be deferred too
+long. This would result in excess memory usage since cyclic trash was not
+freed quickly enough.

From a853c004d31d1f045926860b43d5e0065566d22d Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Wed, 29 Apr 2026 12:38:49 -0700
Subject: [PATCH 4/5] Compute mimalloc memory usage based on full pages.

This avoids stopping the world, in exchange for a less accurate memory
usage estimate.
---
 Include/internal/mimalloc/mimalloc/types.h |   4 +
 Include/internal/pycore_interp_structs.h   |   9 +-
 Objects/mimalloc/init.c                    |   1 +
 Objects/mimalloc/page.c                    |  14 +++
 Objects/obmalloc.c                         |  53 +++++++++
 Python/gc_free_threading.c                 | 125 ++++++++-------------
 6 files changed, 121 insertions(+), 85 deletions(-)

diff --git a/Include/internal/mimalloc/mimalloc/types.h b/Include/internal/mimalloc/mimalloc/types.h
index 286e7bf668312d..87ee42f67b27e5 100644
--- a/Include/internal/mimalloc/mimalloc/types.h
+++ b/Include/internal/mimalloc/mimalloc/types.h
@@ -516,6 +516,10 @@ typedef struct mi_abandoned_pool_s {
   // in order to prevent resetting/decommitting segment memory if it might
   // still be read.
   mi_decl_cache_align _Atomic(size_t) abandoned_readers; // = 0
+
+  // Total bytes (block_size * capacity) of pages currently in MI_BIN_FULL
+  // state whose pool association is this pool.
+  mi_decl_cache_align _Atomic(intptr_t) full_page_bytes; // = 0
 } mi_abandoned_pool_t;
 
 
diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h
index f97ec647b37c81..5851232c94a64b 100644
--- a/Include/internal/pycore_interp_structs.h
+++ b/Include/internal/pycore_interp_structs.h
@@ -281,14 +281,15 @@ struct _gc_runtime_state {
     /* True if gc.freeze() has been used. */
     int freeze_active;
 
-    /* Sum of area->used*area->block_size across all mimalloc heaps after last
-       GC, in KB. Updated under stop-the-world so the measurement is accurate
-       even when OS pages are being reused. */
-    Py_ssize_t last_gc_used;
+    /* Estimate of the number of bytes used by mimalloc after last GC. */
+    Py_ssize_t last_heap_bytes;
 
     /* This accumulates the new object count whenever collection is deferred
        due to memory usage not increasing enough. Reset on collection. */
     Py_ssize_t deferred_count;
+
+    /* Mutex held for gc_should_collect_mem_usage(). */
+    PyMutex mutex;
 #endif
 };
 
diff --git a/Objects/mimalloc/init.c b/Objects/mimalloc/init.c
index 81b241063ff40f..64411bf1c77fdd 100644
--- a/Objects/mimalloc/init.c
+++ b/Objects/mimalloc/init.c
@@ -103,6 +103,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
   NULL, // next
   false,
   0,
+  0,
   0
 };
 
diff --git a/Objects/mimalloc/page.c b/Objects/mimalloc/page.c
index ded59f8eb1ccaa..ca71246c988ca3 100644
--- a/Objects/mimalloc/page.c
+++ b/Objects/mimalloc/page.c
@@ -360,6 +360,10 @@ void _mi_page_unfull(mi_page_t* page) {
   mi_assert_internal(mi_page_is_in_full(page));
   if (!mi_page_is_in_full(page)) return;
 
+#ifdef Py_GIL_DISABLED
+  _PyMem_mi_page_full_dec(page);
+#endif
+
   mi_heap_t* heap = mi_page_heap(page);
   mi_page_queue_t* pqfull = &heap->pages[MI_BIN_FULL];
   mi_page_set_in_full(page, false); // to get the right queue
@@ -374,6 +378,9 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
   mi_assert_internal(!mi_page_is_in_full(page));
   if (mi_page_is_in_full(page)) return;
 
+#ifdef Py_GIL_DISABLED
+  _PyMem_mi_page_full_inc(page);
+#endif
   mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
   _mi_page_free_collect(page,false);  // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
 }
@@ -435,6 +442,13 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
 #ifdef Py_GIL_DISABLED
   mi_assert_internal(page->qsbr_goal == 0);
   mi_assert_internal(page->qsbr_node.next == NULL);
+  // Defensive: a full page whose last block is freed locally goes through
+  // _mi_page_retire -> _PyMem_mi_page_maybe_free -> _mi_page_free without
+  // ever calling _mi_page_unfull, so the per-pool full-page counter must
+  // be decremented here to maintain the invariant.
+  if (mi_page_is_in_full(page)) {
+    _PyMem_mi_page_full_dec(page);
+  }
 #endif
 
   // remove from the page list
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index e2d5b012955c3e..4f944855f4d97d 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -22,6 +22,8 @@ static bool _PyMem_mi_page_is_safe_to_free(mi_page_t *page);
 static bool _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force);
 static void _PyMem_mi_page_reclaimed(mi_page_t *page);
 static void _PyMem_mi_heap_collect_qsbr(mi_heap_t *heap);
+static void _PyMem_mi_page_full_inc(mi_page_t *page);
+static void _PyMem_mi_page_full_dec(mi_page_t *page);
 # include "pycore_mimalloc.h"
 # include "mimalloc/static.c"
 # include "mimalloc/internal.h"  // for stats
@@ -223,6 +225,57 @@ _PyMem_mi_page_reclaimed(mi_page_t *page)
 #endif
 }
 
+// Hooks called from mimalloc page-state transitions to maintain
+// mi_abandoned_pool_t::full_page_bytes -- bytes (block_size * capacity) of
+// pages currently in MI_BIN_FULL state whose pool association is that pool.
+// Page weight uses the same formula as should_advance_qsbr_for_page above;
+// capacity is stable while a page is in the full queue (extend_free is only
+// called on non-full queues), so inc and dec see the same value.
+//
+// The pool a page counts toward is heap->tld->segments.abandoned, which for a
+// Python tstate-bound heap is &interp->mimalloc.abandoned_pool, and for
+// mimalloc's auto-created default heap is _mi_abandoned_default. Pages do
+// not cross pools (mimalloc reclaim only pulls from the reclaiming heap's
+// own pool), so the counter stays valid across abandon/reclaim without any
+// hand-off -- abandon and reclaim therefore have no hooks of their own.
+//
+// The hooks fire only on slow paths: mi_page_to_full / _mi_page_unfull /
+// in-full _mi_page_free. gc_get_heap_bytes() in gc_free_threading.c reads the
+// per-interp pool plus _mi_abandoned_default to get a stop-the-world-free
+// memory-pressure proxy.
+#ifdef Py_GIL_DISABLED
+static inline Py_ssize_t
+_PyMem_mi_page_size(mi_page_t *page)
+{
+    return (Py_ssize_t)(mi_page_block_size(page) * (size_t)page->capacity);
+}
+
+static inline Py_ssize_t *
+_PyMem_mi_page_pool_full_bytes(mi_page_t *page)
+{
+    return (Py_ssize_t *)
+        &mi_page_heap(page)->tld->segments.abandoned->full_page_bytes;
+}
+#endif
+
+static void
+_PyMem_mi_page_full_inc(mi_page_t *page)
+{
+#ifdef Py_GIL_DISABLED
+    _Py_atomic_add_ssize(_PyMem_mi_page_pool_full_bytes(page),
+                         _PyMem_mi_page_size(page));
+#endif
+}
+
+static void
+_PyMem_mi_page_full_dec(mi_page_t *page)
+{
+#ifdef Py_GIL_DISABLED
+    _Py_atomic_add_ssize(_PyMem_mi_page_pool_full_bytes(page),
+                         -_PyMem_mi_page_size(page));
+#endif
+}
+
 static void
 _PyMem_mi_heap_collect_qsbr(mi_heap_t *heap)
 {
diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c
index 8f7ecab64e33ac..5ae23d875a60a6 100644
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@@ -17,7 +17,15 @@
 
 #include "pydtrace.h"
 
-#include "pycore_mimalloc.h"  // mi_heap_visit_blocks()
+// Minimum growth in mimalloc heap bytes (estimated from full pages) since the
+// last GC.
+#define GC_HEAP_BYTES_MIN_DELTA (512 * 1024)
+
+// Maximum number of "young" objects before we stop deferring collection due
+// to heap not growing enough. With the default threshold, this is (40*2000)
+// net new objects. This is set to 40x because older versions of Python would
+// do full collections after roughly every 70,000 new container objects.
+#define GC_MAX_DEFER_FACTOR 40
 
 // enable the "mark alive" pass of GC
 #define GC_ENABLE_MARK_ALIVE 1
@@ -1993,92 +2001,53 @@ cleanup_worklist(struct worklist *worklist)
     }
 }
 
-// Visitor for get_all_mimalloc_used_kb(): called once per heap area.
-struct count_used_area_args {
-    Py_ssize_t total_bytes;
-};
-
-static bool
-count_used_area_visitor(const mi_heap_t *heap, const mi_heap_area_t *area,
-                        void *block, size_t block_size, void *arg)
-{
-    if (block == NULL) {
-        // Called once per area when visit_all_blocks=false.
-        ((struct count_used_area_args *)arg)->total_bytes +=
-            (Py_ssize_t)(area->used * area->block_size);
-    }
-    return true;
-}
-
-// Return the total memory in use across all mimalloc heaps for all threads,
-// in KB. Requires the world to be stopped so heap structures are stable.
+// Return an estimate, in bytes, of how much memory is being used.
 static Py_ssize_t
-get_all_mimalloc_used_kb(PyInterpreterState *interp)
+gc_get_heap_bytes(PyInterpreterState *interp)
 {
-    assert(interp->stoptheworld.world_stopped);
-    struct count_used_area_args args = {0};
-    HEAD_LOCK(&_PyRuntime);
-    _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) {
-        struct _mimalloc_thread_state *m = &((_PyThreadStateImpl *)p)->mimalloc;
-        if (!_Py_atomic_load_int(&m->initialized)) {
-            continue;
-        }
-        for (int h = 0; h < _Py_MIMALLOC_HEAP_COUNT; h++) {
-            mi_heap_visit_blocks(&m->heaps[h], false,
-                                 count_used_area_visitor, &args);
-        }
-    }
-    mi_abandoned_pool_t *pool = &interp->mimalloc.abandoned_pool;
-    // Only GC page tags are supported by _mi_abandoned_pool_visit_blocks.
-    _mi_abandoned_pool_visit_blocks(pool, _Py_MIMALLOC_HEAP_GC, false,
-                                    count_used_area_visitor, &args);
-    _mi_abandoned_pool_visit_blocks(pool, _Py_MIMALLOC_HEAP_GC_PRE, false,
-                                    count_used_area_visitor, &args);
-    HEAD_UNLOCK(&_PyRuntime);
-    return args.total_bytes / 1024;
+    // Computed from mimalloc full-page byte counters maintained on each
+    // abandoned pool (see _PyMem_mi_page_full_inc/dec in Objects/obmalloc.c).
+    Py_ssize_t total = _Py_atomic_load_ssize_relaxed(
+        (Py_ssize_t *)&interp->mimalloc.abandoned_pool.full_page_bytes);
+    total += _Py_atomic_load_ssize_relaxed(
+        (Py_ssize_t *)&_mi_abandoned_default.full_page_bytes);
+    return total;
 }
 
 // Decide whether memory usage has grown enough to warrant a collection.
-// Stops the world to measure mimalloc heap usage accurately; OS-level RSS
-// is unreliable since mimalloc reuses pages without returning them.
 static bool
 gc_should_collect_mem_usage(PyThreadState *tstate)
 {
     PyInterpreterState *interp = tstate->interp;
     GCState *gcstate = &interp->gc;
     int threshold = gcstate->young.threshold;
-
-    if (gcstate->old[0].threshold == 0) {
-        // A few tests rely on immediate scheduling of the GC so we ignore the
-        // extra conditions if generations[1].threshold is set to zero.
-        return true;
-    }
-    if (gcstate->deferred_count > threshold * 40) {
+    Py_ssize_t deferred = _Py_atomic_load_ssize_relaxed(&gcstate->deferred_count);
+    if (deferred > threshold * GC_MAX_DEFER_FACTOR) {
         // Too many new container objects since last GC, even though memory
-        // use might not have increased much. This avoids resource
-        // exhaustion if some objects consume resources but don't result in
-        // a memory usage increase. We use 40x here because older versions
-        // of Python would do full collections after roughly every 70,000
-        // new container objects.
+        // use might not have increased much. This avoids resource exhaustion
+        // if some objects consume resources but don't result in a memory
+        // usage increase.
         return true;
     }
 
-    _PyEval_StopTheWorld(interp);
-    Py_ssize_t used = get_all_mimalloc_used_kb(interp);
-    Py_ssize_t last = gcstate->last_gc_used;
-    Py_ssize_t mem_threshold = Py_MAX(last / 10, 128);
-    if ((used - last) > mem_threshold) {
-        // Heap usage has grown enough, collect.
-        _PyEval_StartTheWorld(interp);
+    Py_ssize_t cur = gc_get_heap_bytes(interp);
+    Py_ssize_t last = _Py_atomic_load_ssize_relaxed(&gcstate->last_heap_bytes);
+    // Require 20% increase in full mimalloc pages.
+    Py_ssize_t delta = Py_MAX(last / 5, GC_HEAP_BYTES_MIN_DELTA);
+    if ((cur - last) > delta) {
+        // Heap has grown enough, collect.
        return true;
    }
-    // Memory usage has not grown enough. Defer the collection, rolling the
-    // young count into deferred_count so we don't keep checking on every
-    // call to gc_should_collect().
-    int young_count = gcstate->young.count;
-    gcstate->young.count = 0;
-    gcstate->deferred_count += young_count;
-    _PyEval_StartTheWorld(interp);
-    return false;
+    else {
+        // Memory usage has not grown enough. Defer the collection, rolling the
+        // young count into deferred_count so we don't keep checking on every
+        // call to gc_should_collect().
+        PyMutex_Lock(&gcstate->mutex);
+        int young_count = _Py_atomic_exchange_int(&gcstate->young.count, 0);
+        _Py_atomic_store_ssize_relaxed(&gcstate->deferred_count,
+                                       gcstate->deferred_count + young_count);
+        PyMutex_Unlock(&gcstate->mutex);
+        return false;
+    }
 }
 
 static bool
@@ -2101,7 +2070,7 @@ gc_should_collect(PyThreadState *tstate)
         // objects.
         return false;
     }
-    return true;
+    return gc_should_collect_mem_usage(tstate);
 }
 
 static void
@@ -2266,11 +2235,9 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state,
     // to be freed.
     delete_garbage(state);
 
-    // Record mimalloc heap usage as the baseline for the next collection's
-    // growth check. Stop-the-world so the heap structures are stable.
-    _PyEval_StopTheWorld(interp);
-    state->gcstate->last_gc_used = get_all_mimalloc_used_kb(interp);
-    _PyEval_StartTheWorld(interp);
+    // Record the current heap bytes estimate as the new baseline.
+    Py_ssize_t last_heap_bytes = gc_get_heap_bytes(interp);
+    _Py_atomic_store_ssize_relaxed(&state->gcstate->last_heap_bytes, last_heap_bytes);
 
     // Append objects with legacy finalizers to the "gc.garbage" list.
     handle_legacy_finalizers(state);
@@ -2316,10 +2283,6 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason)
         _Py_atomic_store_int(&gcstate->collecting, 0);
         return 0;
     }
-    if (reason == _Py_GC_REASON_HEAP && !gc_should_collect_mem_usage(tstate)) {
-        _Py_atomic_store_int(&gcstate->collecting, 0);
-        return 0;
-    }
 
     gcstate->frame = tstate->current_frame;
     assert(generation >= 0 && generation < NUM_GENERATIONS);

From 819a848937447cfd123375bb14885d00f12ce775 Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Thu, 30 Apr 2026 08:19:56 -0700
Subject: [PATCH 5/5] Avoid warnings about unused functions.

---
 Objects/obmalloc.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 4f944855f4d97d..d41d4019124ed3 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -22,8 +22,10 @@ static bool _PyMem_mi_page_is_safe_to_free(mi_page_t *page);
 static bool _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force);
 static void _PyMem_mi_page_reclaimed(mi_page_t *page);
 static void _PyMem_mi_heap_collect_qsbr(mi_heap_t *heap);
+#ifdef Py_GIL_DISABLED
 static void _PyMem_mi_page_full_inc(mi_page_t *page);
 static void _PyMem_mi_page_full_dec(mi_page_t *page);
+#endif
 # include "pycore_mimalloc.h"
 # include "mimalloc/static.c"
 # include "mimalloc/internal.h"  // for stats
@@ -256,25 +258,21 @@ _PyMem_mi_page_pool_full_bytes(mi_page_t *page)
     return (Py_ssize_t *)
         &mi_page_heap(page)->tld->segments.abandoned->full_page_bytes;
 }
-#endif
 
 static void
 _PyMem_mi_page_full_inc(mi_page_t *page)
 {
-#ifdef Py_GIL_DISABLED
     _Py_atomic_add_ssize(_PyMem_mi_page_pool_full_bytes(page),
                          _PyMem_mi_page_size(page));
-#endif
 }
 
 static void
 _PyMem_mi_page_full_dec(mi_page_t *page)
 {
-#ifdef Py_GIL_DISABLED
     _Py_atomic_add_ssize(_PyMem_mi_page_pool_full_bytes(page),
                          -_PyMem_mi_page_size(page));
-#endif
 }
+#endif
 
 static void
 _PyMem_mi_heap_collect_qsbr(mi_heap_t *heap)
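
For illustration, the trigger heuristic from patches 4 and 5 can be condensed
into a small standalone model. This is a sketch, not CPython code: the
gc_model, page_became_full, and page_left_full names are invented for the
example, the single atomic counter stands in for the per-pool
mi_abandoned_pool_t::full_page_bytes counters, and the constants mirror
GC_HEAP_BYTES_MIN_DELTA and GC_MAX_DEFER_FACTOR from the patch.

/*
 * Toy model of the full-page GC trigger (illustration only, not CPython
 * code). An atomic counter approximates heap usage as bytes held by
 * "full" mimalloc pages; a collection is triggered only when that value
 * grows 20% (minimum 512 KiB) past the baseline recorded at the previous
 * collection, with a deferred-object cap as a safety valve.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define HEAP_BYTES_MIN_DELTA (512 * 1024)  /* cf. GC_HEAP_BYTES_MIN_DELTA */
#define MAX_DEFER_FACTOR 40                /* cf. GC_MAX_DEFER_FACTOR */

/* Stands in for mi_abandoned_pool_t::full_page_bytes. */
static atomic_intptr_t full_page_bytes;

/* Page enters MI_BIN_FULL (cf. _PyMem_mi_page_full_inc). */
static void page_became_full(size_t block_size, size_t capacity)
{
    atomic_fetch_add(&full_page_bytes, (intptr_t)(block_size * capacity));
}

/* Page leaves MI_BIN_FULL, or is freed while full (cf. _PyMem_mi_page_full_dec). */
static void page_left_full(size_t block_size, size_t capacity)
{
    atomic_fetch_sub(&full_page_bytes, (intptr_t)(block_size * capacity));
}

struct gc_model {
    intptr_t last_heap_bytes;  /* baseline recorded after the last GC */
    intptr_t deferred_count;   /* young objects rolled over while deferring */
    int young_threshold;       /* cf. gcstate->young.threshold, default 2000 */
};

/* Mirrors the decision logic of gc_should_collect_mem_usage(). */
static bool should_collect(struct gc_model *gc, int young_count)
{
    if (gc->deferred_count > (intptr_t)gc->young_threshold * MAX_DEFER_FACTOR) {
        return true;  /* safety valve: too many objects deferred already */
    }
    intptr_t cur = atomic_load(&full_page_bytes);
    intptr_t delta = gc->last_heap_bytes / 5;  /* require 20% growth... */
    if (delta < HEAP_BYTES_MIN_DELTA) {
        delta = HEAP_BYTES_MIN_DELTA;          /* ...but at least 512 KiB */
    }
    if (cur - gc->last_heap_bytes > delta) {
        return true;  /* heap grew enough since the last collection */
    }
    gc->deferred_count += young_count;  /* defer: roll young into deferred */
    return false;
}

int main(void)
{
    struct gc_model gc = { .last_heap_bytes = 0, .young_threshold = 2000 };
    page_became_full(64, 4096);   /* 256 KiB now sits in full pages */
    printf("collect? %d\n", should_collect(&gc, 2000));  /* 0: below min delta */
    page_became_full(1024, 512);  /* another 512 KiB of full pages */
    printf("collect? %d\n", should_collect(&gc, 2000));  /* 1: grew past delta */
    page_left_full(1024, 512);    /* counter drops again when pages unfull */
    return 0;
}

The shape of the model also shows why the counter only needs slow-path hooks:
the check itself reads a single relaxed atomic, so unlike the patch-1 design
it never stops the world, trading some accuracy for the absence of a
stop-the-world pause on the allocation path.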