diff --git a/Include/internal/mimalloc/mimalloc/types.h b/Include/internal/mimalloc/mimalloc/types.h
index 286e7bf668312d..87ee42f67b27e5 100644
--- a/Include/internal/mimalloc/mimalloc/types.h
+++ b/Include/internal/mimalloc/mimalloc/types.h
@@ -516,6 +516,10 @@ typedef struct mi_abandoned_pool_s {
   // in order to prevent resetting/decommitting segment memory if it might
   // still be read.
   mi_decl_cache_align _Atomic(size_t) abandoned_readers; // = 0
+
+  // Total bytes (block_size * capacity) of pages currently in MI_BIN_FULL
+  // state whose pool association is this pool.
+  mi_decl_cache_align _Atomic(intptr_t) full_page_bytes; // = 0
 } mi_abandoned_pool_t;
diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h
index fb810c82a5aa63..5851232c94a64b 100644
--- a/Include/internal/pycore_interp_structs.h
+++ b/Include/internal/pycore_interp_structs.h
@@ -281,11 +281,11 @@ struct _gc_runtime_state {
     /* True if gc.freeze() has been used. */
     int freeze_active;

-    /* Memory usage of the process (RSS + swap) after last GC. */
-    Py_ssize_t last_mem;
+    /* Estimate of the number of bytes used by mimalloc after last GC. */
+    Py_ssize_t last_heap_bytes;

     /* This accumulates the new object count whenever collection is deferred
-       due to the RSS increase condition not being meet. Reset on collection. */
+       due to memory usage not increasing enough. Reset on collection. */
     Py_ssize_t deferred_count;

     /* Mutex held for gc_should_collect_mem_usage(). */
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-04-23-18-39-41.gh-issue-148937.yp--1l.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-23-18-39-41.gh-issue-148937.yp--1l.rst
new file mode 100644
index 00000000000000..523792372bc8e5
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-23-18-39-41.gh-issue-148937.yp--1l.rst
@@ -0,0 +1,3 @@
+Fix a bug in the free-threaded GC that caused collections to be deferred too
+long. This would result in excess memory usage since cyclic trash was not
+freed quickly enough.
diff --git a/Objects/mimalloc/init.c b/Objects/mimalloc/init.c
index 81b241063ff40f..64411bf1c77fdd 100644
--- a/Objects/mimalloc/init.c
+++ b/Objects/mimalloc/init.c
@@ -103,6 +103,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
   NULL, // next
   false,
   0,
+  0,
   0
 };
diff --git a/Objects/mimalloc/page.c b/Objects/mimalloc/page.c
index ded59f8eb1ccaa..ca71246c988ca3 100644
--- a/Objects/mimalloc/page.c
+++ b/Objects/mimalloc/page.c
@@ -360,6 +360,10 @@ void _mi_page_unfull(mi_page_t* page) {
   mi_assert_internal(mi_page_is_in_full(page));
   if (!mi_page_is_in_full(page)) return;
+#ifdef Py_GIL_DISABLED
+  _PyMem_mi_page_full_dec(page);
+#endif
+
   mi_heap_t* heap = mi_page_heap(page);
   mi_page_queue_t* pqfull = &heap->pages[MI_BIN_FULL];
   mi_page_set_in_full(page, false); // to get the right queue
@@ -374,6 +378,9 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
   mi_assert_internal(!mi_page_is_in_full(page));
   if (mi_page_is_in_full(page)) return;
+#ifdef Py_GIL_DISABLED
+  _PyMem_mi_page_full_inc(page);
+#endif
   mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
   _mi_page_free_collect(page,false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
 }
@@ -435,6 +442,13 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
 #ifdef Py_GIL_DISABLED
   mi_assert_internal(page->qsbr_goal == 0);
   mi_assert_internal(page->qsbr_node.next == NULL);
+  // Defensive: a full page whose last block is freed locally goes through
+  // _mi_page_retire -> _PyMem_mi_page_maybe_free -> _mi_page_free without
+  // ever calling _mi_page_unfull, so the pool's full-page byte counter must
+  // be decremented here to maintain the invariant.
+  if (mi_page_is_in_full(page)) {
+    _PyMem_mi_page_full_dec(page);
+  }
 #endif

   // remove from the page list
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index e2d5b012955c3e..d41d4019124ed3 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -22,6 +22,10 @@ static bool _PyMem_mi_page_is_safe_to_free(mi_page_t *page);
 static bool _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force);
 static void _PyMem_mi_page_reclaimed(mi_page_t *page);
 static void _PyMem_mi_heap_collect_qsbr(mi_heap_t *heap);
+#ifdef Py_GIL_DISABLED
+static void _PyMem_mi_page_full_inc(mi_page_t *page);
+static void _PyMem_mi_page_full_dec(mi_page_t *page);
+#endif
 #  include "pycore_mimalloc.h"
 #  include "mimalloc/static.c"
 #  include "mimalloc/internal.h"  // for stats
@@ -223,6 +227,53 @@ _PyMem_mi_page_reclaimed(mi_page_t *page)
 #endif
 }

+// Hooks called from mimalloc page-state transitions to maintain
+// mi_abandoned_pool_t::full_page_bytes -- bytes (block_size * capacity) of
+// pages currently in MI_BIN_FULL state whose pool association is that pool.
+// Page weight uses the same formula as should_advance_qsbr_for_page above;
+// capacity is stable while a page is in the full queue (extend_free is only
+// called on non-full queues), so inc and dec see the same value.
+//
+// The pool a page counts toward is heap->tld->segments.abandoned, which for a
+// Python tstate-bound heap is &interp->mimalloc.abandoned_pool, and for
+// mimalloc's auto-created default heap is _mi_abandoned_default. Pages do
+// not cross pools (mimalloc reclaim only pulls from the reclaiming heap's
+// own pool), so the counter stays valid across abandon/reclaim without any
+// hand-off -- abandon and reclaim therefore have no hooks of their own.
+//
+// The hooks fire only on slow paths: mi_page_to_full / _mi_page_unfull /
+// in-full _mi_page_free. gc_get_heap_bytes() in gc_free_threading.c reads the
+// per-interp pool plus _mi_abandoned_default to get a stop-the-world-free
+// memory-pressure proxy.
+#ifdef Py_GIL_DISABLED
+static inline Py_ssize_t
+_PyMem_mi_page_size(mi_page_t *page)
+{
+    return (Py_ssize_t)(mi_page_block_size(page) * (size_t)page->capacity);
+}
+
+static inline Py_ssize_t *
+_PyMem_mi_page_pool_full_bytes(mi_page_t *page)
+{
+    return (Py_ssize_t *)
+        &mi_page_heap(page)->tld->segments.abandoned->full_page_bytes;
+}
+
+static void
+_PyMem_mi_page_full_inc(mi_page_t *page)
+{
+    _Py_atomic_add_ssize(_PyMem_mi_page_pool_full_bytes(page),
+                         _PyMem_mi_page_size(page));
+}
+
+static void
+_PyMem_mi_page_full_dec(mi_page_t *page)
+{
+    _Py_atomic_add_ssize(_PyMem_mi_page_pool_full_bytes(page),
+                         -_PyMem_mi_page_size(page));
+}
+#endif
+
 static void
 _PyMem_mi_heap_collect_qsbr(mi_heap_t *heap)
 {
diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c
index 4b46ca04f56b20..5ae23d875a60a6 100644
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@@ -17,30 +17,15 @@
 #include "pydtrace.h"

-// Platform-specific includes for get_process_mem_usage().
-#ifdef _WIN32
-  #include <windows.h>
-  #include <psapi.h>       // For GetProcessMemoryInfo
-#elif defined(__linux__)
-  #include <unistd.h>      // For sysconf, getpid
-#elif defined(__APPLE__)
-  #include <mach/mach.h>
-  #include <mach/task.h>   // Required for TASK_VM_INFO
-  #include <unistd.h>      // For sysconf, getpid
-#elif defined(__FreeBSD__)
-  #include <sys/types.h>
-  #include <sys/sysctl.h>
-  #include <sys/user.h>    // Requires sys/user.h for kinfo_proc definition
-  #include <kvm.h>
-  #include <unistd.h>      // For sysconf, getpid
-  #include <fcntl.h>       // For O_RDONLY
-  #include <limits.h>      // For _POSIX2_LINE_MAX
-#elif defined(__OpenBSD__)
-  #include <sys/types.h>
-  #include <sys/sysctl.h>
-  #include <sys/user.h>    // For kinfo_proc
-  #include <unistd.h>      // For sysconf, getpid
-#endif
+// Minimum growth in mimalloc heap bytes (estimated from full pages) since the
+// last GC.
+#define GC_HEAP_BYTES_MIN_DELTA (512 * 1024)
+
+// Maximum number of "young" objects before we stop deferring collection due
+// to heap not growing enough. With the default threshold, this is (40*2000)
+// net new objects. This is set to 40x because older versions of Python would
+// do full collections after roughly every 70,000 new container objects.
+#define GC_MAX_DEFER_FACTOR 40

 // enable the "mark alive" pass of GC
 #define GC_ENABLE_MARK_ALIVE 1
@@ -2016,176 +2001,46 @@ cleanup_worklist(struct worklist *worklist)
     }
 }

-// Return the memory usage (typically RSS + swap) of the process, in units of
-// KB. Returns -1 if this operation is not supported or on failure.
+// Return an estimate, in bytes, of how much memory is being used.
 static Py_ssize_t
-get_process_mem_usage(void)
-{
-#ifdef _WIN32
-    // Windows implementation using GetProcessMemoryInfo
-    // Returns WorkingSetSize + PagefileUsage
-    PROCESS_MEMORY_COUNTERS pmc;
-    HANDLE hProcess = GetCurrentProcess();
-    if (NULL == hProcess) {
-        // Should not happen for the current process
-        return -1;
-    }
-
-    // GetProcessMemoryInfo returns non-zero on success
-    if (GetProcessMemoryInfo(hProcess, &pmc, sizeof(pmc))) {
-        // Values are in bytes, convert to KB.
-        return (Py_ssize_t)((pmc.WorkingSetSize + pmc.PagefileUsage) / 1024);
-    }
-    else {
-        return -1;
-    }
-
-#elif __linux__
-    FILE* fp = fopen("/proc/self/status", "r");
-    if (fp == NULL) {
-        return -1;
-    }
-
-    char line_buffer[256];
-    long long rss_kb = -1;
-    long long swap_kb = -1;
-
-    while (fgets(line_buffer, sizeof(line_buffer), fp) != NULL) {
-        if (rss_kb == -1 && strncmp(line_buffer, "VmRSS:", 6) == 0) {
-            sscanf(line_buffer + 6, "%lld", &rss_kb);
-        }
-        else if (swap_kb == -1 && strncmp(line_buffer, "VmSwap:", 7) == 0) {
-            sscanf(line_buffer + 7, "%lld", &swap_kb);
-        }
-        if (rss_kb != -1 && swap_kb != -1) {
-            break; // Found both
-        }
-    }
-    fclose(fp);
-
-    if (rss_kb != -1 && swap_kb != -1) {
-        return (Py_ssize_t)(rss_kb + swap_kb);
-    }
-    return -1;
-
-#elif defined(__APPLE__)
-    // --- MacOS (Darwin) ---
-    // Returns phys_footprint (RAM + compressed memory)
-    task_vm_info_data_t vm_info;
-    mach_msg_type_number_t count = TASK_VM_INFO_COUNT;
-    kern_return_t kerr;
-
-    kerr = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count);
-    if (kerr != KERN_SUCCESS) {
-        return -1;
-    }
-    // phys_footprint is in bytes. Convert to KB.
-    return (Py_ssize_t)(vm_info.phys_footprint / 1024);
-
-#elif defined(__FreeBSD__)
-    // NOTE: Returns RSS only. Per-process swap usage isn't readily available
-    long page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
-    if (page_size_kb <= 0) {
-        return -1;
-    }
-
-    // Using /dev/null for vmcore avoids needing dump file.
-    // NULL for kernel file uses running kernel.
-    char errbuf[_POSIX2_LINE_MAX]; // For kvm error messages
-    kvm_t *kd = kvm_openfiles(NULL, "/dev/null", NULL, O_RDONLY, errbuf);
-    if (kd == NULL) {
-        return -1;
-    }
-
-    // KERN_PROC_PID filters for the specific process ID
-    // n_procs will contain the number of processes returned (should be 1 or 0)
-    pid_t pid = getpid();
-    int n_procs;
-    struct kinfo_proc *kp = kvm_getprocs(kd, KERN_PROC_PID, pid, &n_procs);
-    if (kp == NULL) {
-        kvm_close(kd);
-        return -1;
-    }
-
-    Py_ssize_t rss_kb = -1;
-    if (n_procs > 0) {
-        // kp[0] contains the info for our process
-        // ki_rssize is in pages. Convert to KB.
-        rss_kb = (Py_ssize_t)kp->ki_rssize * page_size_kb;
-    }
-    else {
-        // Process with PID not found, shouldn't happen for self.
-        rss_kb = -1;
-    }
-
-    kvm_close(kd);
-    return rss_kb;
-
-#elif defined(__OpenBSD__)
-    // NOTE: Returns RSS only. Per-process swap usage isn't readily available
-    long page_size_kb = sysconf(_SC_PAGESIZE) / 1024;
-    if (page_size_kb <= 0) {
-        return -1;
-    }
-
-    struct kinfo_proc kp;
-    pid_t pid = getpid();
-    int mib[6];
-    size_t len = sizeof(kp);
-
-    mib[0] = CTL_KERN;
-    mib[1] = KERN_PROC;
-    mib[2] = KERN_PROC_PID;
-    mib[3] = pid;
-    mib[4] = sizeof(struct kinfo_proc); // size of the structure we want
-    mib[5] = 1; // want 1 structure back
-    if (sysctl(mib, 6, &kp, &len, NULL, 0) == -1) {
-        return -1;
-    }
-
-    if (len > 0) {
-        // p_vm_rssize is in pages on OpenBSD. Convert to KB.
-        return (Py_ssize_t)kp.p_vm_rssize * page_size_kb;
-    }
-    else {
-        // Process info not returned
-        return -1;
-    }
-#else
-    // Unsupported platform
-    return -1;
-#endif
+gc_get_heap_bytes(PyInterpreterState *interp)
+{
+    // Computed from mimalloc full-page byte counters maintained on each
+    // abandoned pool (see _PyMem_mi_page_full_inc/dec in Objects/obmalloc.c).
+    Py_ssize_t total = _Py_atomic_load_ssize_relaxed(
+        (Py_ssize_t *)&interp->mimalloc.abandoned_pool.full_page_bytes);
+    total += _Py_atomic_load_ssize_relaxed(
+        (Py_ssize_t *)&_mi_abandoned_default.full_page_bytes);
+    return total;
 }

+// Decide whether memory usage has grown enough to warrant a collection.
 static bool
-gc_should_collect_mem_usage(GCState *gcstate)
+gc_should_collect_mem_usage(PyThreadState *tstate)
 {
-    Py_ssize_t mem = get_process_mem_usage();
-    if (mem < 0) {
-        // Reading process memory usage is not support or failed.
-        return true;
-    }
+    PyInterpreterState *interp = tstate->interp;
+    GCState *gcstate = &interp->gc;

     int threshold = gcstate->young.threshold;
     Py_ssize_t deferred = _Py_atomic_load_ssize_relaxed(&gcstate->deferred_count);
-    if (deferred > threshold * 40) {
-        // Too many new container objects since last GC, even though memory use
-        // might not have increased much. This is intended to avoid resource
-        // exhaustion if some objects consume resources but don't result in a
-        // memory usage increase. We use 40x as the factor here because older
-        // versions of Python would do full collections after roughly every
-        // 70,000 new container objects.
+    if (deferred > threshold * GC_MAX_DEFER_FACTOR) {
+        // Too many new container objects since last GC, even though memory
+        // use might not have increased much. This avoids resource exhaustion
+        // if some objects consume resources but don't result in a memory
+        // usage increase.
         return true;
     }

-    Py_ssize_t last_mem = _Py_atomic_load_ssize_relaxed(&gcstate->last_mem);
-    Py_ssize_t mem_threshold = Py_MAX(last_mem / 10, 128);
-    if ((mem - last_mem) > mem_threshold) {
-        // The process memory usage has increased too much, do a collection.
+    Py_ssize_t cur = gc_get_heap_bytes(interp);
+    Py_ssize_t last = _Py_atomic_load_ssize_relaxed(&gcstate->last_heap_bytes);
+    // Require 20% increase in full mimalloc pages.
+    Py_ssize_t delta = Py_MAX(last / 5, GC_HEAP_BYTES_MIN_DELTA);
+    if ((cur - last) > delta) {
+        // Heap has grown enough, collect.
         return true;
     }
     else {
-        // The memory usage has not increased enough, defer the collection and
-        // clear the young object count so we don't check memory usage again
-        // on the next call to gc_should_collect().
+        // Memory usage has not grown enough. Defer the collection, rolling the
+        // young count into deferred_count so we don't keep checking on every
+        // call to gc_should_collect().
         PyMutex_Lock(&gcstate->mutex);
         int young_count = _Py_atomic_exchange_int(&gcstate->young.count, 0);
         _Py_atomic_store_ssize_relaxed(&gcstate->deferred_count,
@@ -2196,8 +2051,9 @@ gc_should_collect_mem_usage(GCState *gcstate)
 }

 static bool
-gc_should_collect(GCState *gcstate)
+gc_should_collect(PyThreadState *tstate)
 {
+    GCState *gcstate = &tstate->interp->gc;
     int count = _Py_atomic_load_int_relaxed(&gcstate->young.count);
     int threshold = gcstate->young.threshold;
     int gc_enabled = _Py_atomic_load_int_relaxed(&gcstate->enabled);
@@ -2214,7 +2070,7 @@ gc_should_collect(GCState *gcstate)
         // objects.
         return false;
     }
-    return gc_should_collect_mem_usage(gcstate);
+    return gc_should_collect_mem_usage(tstate);
 }

 static void
@@ -2231,7 +2087,7 @@ record_allocation(PyThreadState *tstate)
         _Py_atomic_add_int(&gcstate->young.count, (int)gc->alloc_count);
         gc->alloc_count = 0;

-        if (gc_should_collect(gcstate) &&
+        if (gc_should_collect(tstate) &&
            !_Py_atomic_load_int_relaxed(&gcstate->collecting))
        {
            _Py_ScheduleGC(tstate);
@@ -2379,10 +2235,9 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state,
     // to be freed.
     delete_garbage(state);

-    // Store the current memory usage, can be smaller now if breaking cycles
-    // freed some memory.
-    Py_ssize_t last_mem = get_process_mem_usage();
-    _Py_atomic_store_ssize_relaxed(&state->gcstate->last_mem, last_mem);
+    // Record the current heap bytes estimate as new baseline.
+    Py_ssize_t last_heap_bytes = gc_get_heap_bytes(interp);
+    _Py_atomic_store_ssize_relaxed(&state->gcstate->last_heap_bytes, last_heap_bytes);

     // Append objects with legacy finalizers to the "gc.garbage" list.
     handle_legacy_finalizers(state);
@@ -2423,7 +2278,7 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason)
         return 0;
     }

-    if (reason == _Py_GC_REASON_HEAP && !gc_should_collect(gcstate)) {
+    if (reason == _Py_GC_REASON_HEAP && !gc_should_collect(tstate)) {
        // Don't collect if the threshold is not exceeded.
        _Py_atomic_store_int(&gcstate->collecting, 0);
        return 0;
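A minimal standalone sketch of the collection heuristic this patch installs in gc_should_collect_mem_usage(). The should_collect name, the plain-integer parameters, and ptrdiff_t standing in for Py_ssize_t are illustrative assumptions; in the real code the current and baseline byte counts come from the mimalloc pool counters and atomic GCState fields, and the deferring branch also rolls young.count into deferred_count.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Thresholds copied from the patch. */
    #define GC_HEAP_BYTES_MIN_DELTA (512 * 1024)
    #define GC_MAX_DEFER_FACTOR 40

    /* Hypothetical stand-in for gc_should_collect_mem_usage(). */
    static bool
    should_collect(ptrdiff_t cur_heap_bytes, ptrdiff_t last_heap_bytes,
                   ptrdiff_t deferred, int threshold)
    {
        /* Escape hatch: too many new container objects since the last GC,
         * even if the heap-bytes proxy has not grown. */
        if (deferred > (ptrdiff_t)threshold * GC_MAX_DEFER_FACTOR) {
            return true;
        }
        /* Otherwise require roughly 20% growth in full-page bytes, with a
         * 512 KiB floor so small heaps still trigger collections. */
        ptrdiff_t delta = last_heap_bytes / 5;
        if (delta < GC_HEAP_BYTES_MIN_DELTA) {
            delta = GC_HEAP_BYTES_MIN_DELTA;
        }
        return (cur_heap_bytes - last_heap_bytes) > delta;
    }

    int
    main(void)
    {
        /* Heap grew from 10 MiB to 13 MiB (more than 20%): collect. */
        printf("%d\n", should_collect(13631488, 10485760, 1000, 2000));  /* 1 */
        /* Heap grew by only 256 KiB and few new objects: defer. */
        printf("%d\n", should_collect(10747904, 10485760, 1000, 2000));  /* 0 */
        return 0;
    }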