diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index 1c56ad5af8072f..3d595e30714f6b 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -55,14 +55,14 @@ typedef struct _err_stackitem { } _PyErr_StackItem; typedef struct _stack_chunk { - struct _stack_chunk *previous; size_t size; - size_t top; - PyObject * data[1]; /* Variable sized */ + struct _stack_chunk *previous; + PyObject *data[1]; /* Variable sized */ } _PyStackChunk; -/* Minimum size of data stack chunk */ -#define _PY_DATA_STACK_CHUNK_SIZE (16*1024) +#define _PY_STACK_CHUNK_MIN_SIZE 4096 +#define _PY_STACK_CHUNK_OVERHEADS (offsetof(_PyStackChunk, data)) + struct _ts { /* See Python/ceval.c for comments explaining most fields */ @@ -195,10 +195,9 @@ struct _ts { /* Unique thread state id. */ uint64_t id; - _PyStackChunk *datastack_chunk; - PyObject **datastack_top; - PyObject **datastack_limit; - _PyStackChunk *datastack_cached_chunk; + _PyStackChunk *stack_chunk_list; + PyObject **stack_top; + PyObject **stack_limit; /* XXX signal handlers should also be here */ /* The following fields are here to avoid allocation during init. diff --git a/Include/internal/pycore_debug_offsets.h b/Include/internal/pycore_debug_offsets.h index c166f963da4f66..11868106dab142 100644 --- a/Include/internal/pycore_debug_offsets.h +++ b/Include/internal/pycore_debug_offsets.h @@ -106,7 +106,7 @@ typedef struct _Py_DebugOffsets { uint64_t last_profiled_frame; uint64_t thread_id; uint64_t native_thread_id; - uint64_t datastack_chunk; + uint64_t stack_chunk_list; uint64_t status; uint64_t holds_gil; uint64_t gil_requested; @@ -287,7 +287,7 @@ typedef struct _Py_DebugOffsets { .last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \ .thread_id = offsetof(PyThreadState, thread_id), \ .native_thread_id = offsetof(PyThreadState, native_thread_id), \ - .datastack_chunk = offsetof(PyThreadState, datastack_chunk), \ + .stack_chunk_list = offsetof(PyThreadState, stack_chunk_list), \ .status = offsetof(PyThreadState, _status), \ .holds_gil = offsetof(PyThreadState, holds_gil), \ .gil_requested = offsetof(PyThreadState, gil_requested), \ diff --git a/Include/internal/pycore_interpframe.h b/Include/internal/pycore_interpframe.h index d744dd12cd0479..0d5d1c84b3efe1 100644 --- a/Include/internal/pycore_interpframe.h +++ b/Include/internal/pycore_interpframe.h @@ -330,12 +330,12 @@ static inline bool _PyThreadState_HasStackSpace(PyThreadState *tstate, int size) { assert( - (tstate->datastack_top == NULL && tstate->datastack_limit == NULL) + (tstate->stack_top == NULL && tstate->stack_limit == NULL) || - (tstate->datastack_top != NULL && tstate->datastack_limit != NULL) + (tstate->stack_top != NULL && tstate->stack_limit != NULL) ); - return tstate->datastack_top != NULL && - size < tstate->datastack_limit - tstate->datastack_top; + return tstate->stack_top != NULL && + size < tstate->stack_limit - tstate->stack_top; } extern _PyInterpreterFrame * @@ -352,9 +352,9 @@ _PyFrame_PushUnchecked(PyThreadState *tstate, _PyStackRef func, int null_locals_ CALL_STAT_INC(frames_pushed); PyFunctionObject *func_obj = (PyFunctionObject *)PyStackRef_AsPyObjectBorrow(func); PyCodeObject *code = (PyCodeObject *)func_obj->func_code; - _PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->datastack_top; - tstate->datastack_top += code->co_framesize; - assert(tstate->datastack_top < tstate->datastack_limit); + _PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->stack_top; + tstate->stack_top += code->co_framesize; + assert(tstate->stack_top < tstate->stack_limit); _PyFrame_Initialize(tstate, new_frame, func, NULL, code, null_locals_from, previous); return new_frame; @@ -366,9 +366,9 @@ static inline _PyInterpreterFrame * _PyFrame_PushTrampolineUnchecked(PyThreadState *tstate, PyCodeObject *code, int stackdepth, _PyInterpreterFrame * previous) { CALL_STAT_INC(frames_pushed); - _PyInterpreterFrame *frame = (_PyInterpreterFrame *)tstate->datastack_top; - tstate->datastack_top += code->co_framesize; - assert(tstate->datastack_top < tstate->datastack_limit); + _PyInterpreterFrame *frame = (_PyInterpreterFrame *)tstate->stack_top; + tstate->stack_top += code->co_framesize; + assert(tstate->stack_top < tstate->stack_limit); frame->previous = previous; frame->f_funcobj = PyStackRef_None; frame->f_executable = PyStackRef_FromPyObjectNew(code); @@ -404,6 +404,54 @@ PyAPI_FUNC(_PyInterpreterFrame *) _PyEvalFramePushAndInit_Ex(PyThreadState *tstate, _PyStackRef func, PyObject *locals, Py_ssize_t nargs, PyObject *callargs, PyObject *kwargs, _PyInterpreterFrame *previous); +static inline bool +ptr_in_chunk(const char *ptr, const _PyStackChunk *chunk) +{ + assert(chunk != NULL); + const char *start = (char *)&chunk->data[0]; + const intptr_t offset = ptr - start; + const intptr_t usable_size = (intptr_t)(chunk->size - _PY_STACK_CHUNK_OVERHEADS); + return offset >= 0 && offset < usable_size && start + offset == ptr; +} + +static inline uintptr_t +get_offset_in_chunk(const char *ptr, const _PyStackChunk *chunk) +{ + assert(chunk != NULL); + assert(chunk->data != NULL); + assert(ptr_in_chunk(ptr, chunk)); + + return ptr - (char *)chunk; +} + +static inline uintptr_t +get_offset_in_chunk_list(char *base, _PyStackChunk *stack_chunk_list) +{ + assert(stack_chunk_list != NULL); + assert(base != NULL); + _PyStackChunk *chunk = stack_chunk_list; + do { + if (ptr_in_chunk(base, chunk)) { + return get_offset_in_chunk(base, chunk); + } + chunk = chunk->previous; + } while (chunk); + assert(false); // did not find correct chunk + Py_UNREACHABLE(); +} + +static inline void * +_Py_ensure_frame_in_current_stack_chunk(PyThreadState *tstate, char *frame) +{ + assert(tstate != NULL); + assert(frame != NULL); + if (ptr_in_chunk(frame, tstate->stack_chunk_list)) { + return frame; + } + uintptr_t offset = get_offset_in_chunk_list(frame, tstate->stack_chunk_list->previous); + return ((char *)tstate->stack_chunk_list) + offset; +} + #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h index e8d1098c2078fc..c9506bf340164b 100644 --- a/Include/internal/pycore_runtime_init.h +++ b/Include/internal/pycore_runtime_init.h @@ -179,6 +179,9 @@ extern PyTypeObject _PyExc_MemoryError; ._whence = _PyThreadState_WHENCE_NOTSET, \ .py_recursion_limit = Py_DEFAULT_RECURSION_LIMIT, \ .context_ver = 1, \ + .stack_chunk_list = NULL, \ + .stack_limit = NULL, \ + .stack_top = NULL, \ } diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-04-17-12-27-33.gh-issue-142183.jKFUky.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-17-12-27-33.gh-issue-142183.jKFUky.rst new file mode 100644 index 00000000000000..7be9d1cf8ae1eb --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-17-12-27-33.gh-issue-142183.jKFUky.rst @@ -0,0 +1 @@ +Change the resizing implementation of the Python stack to use a resizable array instead of a chain of arrays, to avoid degenerate performance cases and allow further JIT optimizations. diff --git a/Modules/_remote_debugging/debug_offsets_validation.h b/Modules/_remote_debugging/debug_offsets_validation.h index 32800e767b3169..73572c3875d3cd 100644 --- a/Modules/_remote_debugging/debug_offsets_validation.h +++ b/Modules/_remote_debugging/debug_offsets_validation.h @@ -252,7 +252,7 @@ validate_fixed_field( #define PY_REMOTE_DEBUG_THREAD_STATE_FIELDS(APPLY, buffer_size) \ APPLY(thread_state, native_thread_id, sizeof(unsigned long), _Alignof(long), buffer_size); \ APPLY(thread_state, interp, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ - APPLY(thread_state, datastack_chunk, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ + APPLY(thread_state, stack_chunk_list, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ APPLY(thread_state, status, FIELD_SIZE(PyThreadState, _status), _Alignof(unsigned int), buffer_size); \ APPLY(thread_state, holds_gil, sizeof(int), _Alignof(int), buffer_size); \ APPLY(thread_state, gil_requested, sizeof(int), _Alignof(int), buffer_size); \ diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index bbdfce3f7201d9..133ae36ff994d6 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -27,7 +27,7 @@ process_single_stack_chunk( StackChunkInfo *chunk_info ) { // Start with default size assumption - size_t current_size = _PY_DATA_STACK_CHUNK_SIZE; + size_t current_size = _PY_STACK_CHUNK_MIN_SIZE; char *this_chunk = PyMem_RawMalloc(current_size); if (!this_chunk) { @@ -87,11 +87,17 @@ copy_stack_chunks(RemoteUnwinderObject *unwinder, size_t count = 0; size_t max_chunks = 16; - if (read_ptr(unwinder, tstate_addr + (uintptr_t)unwinder->debug_offsets.thread_state.datastack_chunk, &chunk_addr)) { + if (read_ptr(unwinder, tstate_addr + (uintptr_t)unwinder->debug_offsets.thread_state.stack_chunk_list, &chunk_addr)) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read initial stack chunk address"); return -1; } + if (!chunk_addr) { + out_chunks->chunks = NULL; + out_chunks->count = 0; + return 0; + } + chunks = PyMem_RawMalloc(max_chunks * sizeof(StackChunkInfo)); if (!chunks) { PyErr_NoMemory(); @@ -99,33 +105,14 @@ copy_stack_chunks(RemoteUnwinderObject *unwinder, return -1; } - const size_t MAX_STACK_CHUNKS = 4096; - while (chunk_addr != 0 && count < MAX_STACK_CHUNKS) { - // Grow array if needed - if (count >= max_chunks) { - max_chunks *= 2; - StackChunkInfo *new_chunks = PyMem_RawRealloc(chunks, max_chunks * sizeof(StackChunkInfo)); - if (!new_chunks) { - PyErr_NoMemory(); - set_exception_cause(unwinder, PyExc_MemoryError, "Failed to grow stack chunks array"); - goto error; - } - chunks = new_chunks; - } - - // Process this chunk - if (process_single_stack_chunk(unwinder, chunk_addr, &chunks[count]) < 0) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process stack chunk"); - goto error; - } - - // Get next chunk address and increment count - chunk_addr = GET_MEMBER(uintptr_t, chunks[count].local_copy, offsetof(_PyStackChunk, previous)); - count++; + // Process this chunk + if (process_single_stack_chunk(unwinder, chunk_addr, &chunks[count]) < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process stack chunk"); + goto error; } out_chunks->chunks = chunks; - out_chunks->count = count; + out_chunks->count = 1; return 0; error: diff --git a/Python/ceval.c b/Python/ceval.c index ead0df31c44266..5d29ce7cb5c293 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1925,8 +1925,13 @@ clear_thread_frame(PyThreadState *tstate, _PyInterpreterFrame * frame) assert(frame->owner == FRAME_OWNED_BY_THREAD); // Make sure that this is, indeed, the top frame. We can't check this in // _PyThreadState_PopFrame, since f_code is already cleared at that point: - assert((PyObject **)frame + _PyFrame_GetCode(frame)->co_framesize == - tstate->datastack_top); + assert( + _Py_ensure_frame_in_current_stack_chunk( // the frame might be in a previous stack chunk + tstate, + (char *)((PyObject **)frame + _PyFrame_GetCode(frame)->co_framesize) + ) + == tstate->stack_top + ); assert(frame->frame_obj == NULL || frame->frame_obj->f_frame == frame); _PyFrame_ClearExceptCode(frame); PyStackRef_CLEAR(frame->f_executable); diff --git a/Python/pystate.c b/Python/pystate.c index 3f539a4c2551ba..bf85e7404dd833 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1433,17 +1433,16 @@ tstate_is_alive(PyThreadState *tstate) // lifecycle //---------- -static _PyStackChunk* -allocate_chunk(int size_in_bytes, _PyStackChunk* previous) +static _PyStackChunk * +allocate_stack_chunk(size_t size_in_bytes) { assert(size_in_bytes % sizeof(PyObject **) == 0); _PyStackChunk *res = _PyObject_VirtualAlloc(size_in_bytes); if (res == NULL) { return NULL; } - res->previous = previous; + res->previous = NULL; res->size = size_in_bytes; - res->top = 0; return res; } @@ -1572,10 +1571,9 @@ init_threadstate(_PyThreadStateImpl *_tstate, tstate->current_frame = &_tstate->base_frame; // base_frame pointer for profilers to validate stack unwinding tstate->base_frame = &_tstate->base_frame; - tstate->datastack_chunk = NULL; - tstate->datastack_top = NULL; - tstate->datastack_limit = NULL; - tstate->datastack_cached_chunk = NULL; + tstate->stack_chunk_list = NULL; + tstate->stack_top = NULL; + tstate->stack_limit = NULL; tstate->what_event = -1; tstate->current_executor = NULL; tstate->jit_exit = NULL; @@ -1717,20 +1715,17 @@ _PyThreadState_Init(PyThreadState *tstate) static void -clear_datastack(PyThreadState *tstate) +clear_stack_chunk_list(PyThreadState *tstate) { - _PyStackChunk *chunk = tstate->datastack_chunk; - tstate->datastack_chunk = NULL; - while (chunk != NULL) { - _PyStackChunk *prev = chunk->previous; - _PyObject_VirtualFree(chunk, chunk->size); - chunk = prev; - } - if (tstate->datastack_cached_chunk != NULL) { - _PyObject_VirtualFree(tstate->datastack_cached_chunk, - tstate->datastack_cached_chunk->size); - tstate->datastack_cached_chunk = NULL; + assert(tstate != NULL); + _PyStackChunk *chunk_list = tstate->stack_chunk_list; + while (chunk_list != NULL) { + _PyStackChunk *prev = chunk_list->previous; + size_t size = chunk_list->size; + _PyObject_VirtualFree(chunk_list, size); + chunk_list = prev; } + tstate->stack_chunk_list = NULL; } void @@ -1932,7 +1927,7 @@ tstate_delete_common(PyThreadState *tstate, int release_gil) } // XXX Move to PyThreadState_Clear()? - clear_datastack(tstate); + clear_stack_chunk_list(tstate); if (release_gil) { _PyEval_ReleaseLock(tstate->interp, tstate, 1); @@ -3052,42 +3047,44 @@ _PyInterpreterState_HasFeature(PyInterpreterState *interp, unsigned long feature return ((interp->feature_flags & feature) != 0); } - -#define MINIMUM_OVERHEAD 1000 +#define MINIMUM_OVERHEAD 0 static PyObject ** -push_chunk(PyThreadState *tstate, int size) -{ - int allocate_size = _PY_DATA_STACK_CHUNK_SIZE; - while (allocate_size < (int)sizeof(PyObject*)*(size + MINIMUM_OVERHEAD)) { +resize_stack(PyThreadState *tstate, int size) +{ + size_t current_size, allocate_size; + _PyStackChunk *old = tstate->stack_chunk_list; + if (old) { + current_size = tstate->stack_chunk_list->size; + assert(current_size > 0); + allocate_size = current_size * 2; + } else { + current_size = 0; + allocate_size = _PY_STACK_CHUNK_MIN_SIZE; + } + assert(allocate_size > current_size); + assert(allocate_size - current_size > _PY_STACK_CHUNK_OVERHEADS); + size_t required_space = sizeof(PyObject *) * (size + MINIMUM_OVERHEAD) + _PY_STACK_CHUNK_OVERHEADS; + while (allocate_size < required_space) { allocate_size *= 2; } - _PyStackChunk *new; - if (tstate->datastack_cached_chunk != NULL - && (size_t)allocate_size <= tstate->datastack_cached_chunk->size) - { - new = tstate->datastack_cached_chunk; - tstate->datastack_cached_chunk = NULL; - new->previous = tstate->datastack_chunk; - new->top = 0; - } - else { - new = allocate_chunk(allocate_size, tstate->datastack_chunk); - if (new == NULL) { - return NULL; - } + assert(allocate_size > 0); + _PyStackChunk *new = allocate_stack_chunk(allocate_size); + if (new == NULL) { + return NULL; } - if (tstate->datastack_chunk) { - tstate->datastack_chunk->top = tstate->datastack_top - - &tstate->datastack_chunk->data[0]; - } - tstate->datastack_chunk = new; - tstate->datastack_limit = (PyObject **)(((char *)new) + allocate_size); - // When new is the "root" chunk (i.e. new->previous == NULL), we can keep - // _PyThreadState_PopFrame from freeing it later by "skipping" over the - // first element: - PyObject **res = &new->data[new->previous == NULL]; - tstate->datastack_top = res + size; + if (old) { + new->previous = old; + long current_stack_size = tstate->stack_top - &old->data[0]; + assert(current_stack_size > 0); + tstate->stack_top = &new->data[current_stack_size]; + } else { + tstate->stack_top = &new->data[0]; + } + tstate->stack_chunk_list = new; + tstate->stack_limit = (PyObject **)(((char *)new) + allocate_size); + PyObject **res = tstate->stack_top; + tstate->stack_top = res + size; return res; } @@ -3096,38 +3093,22 @@ _PyThreadState_PushFrame(PyThreadState *tstate, size_t size) { assert(size < INT_MAX/sizeof(PyObject *)); if (_PyThreadState_HasStackSpace(tstate, (int)size)) { - _PyInterpreterFrame *res = (_PyInterpreterFrame *)tstate->datastack_top; - tstate->datastack_top += size; + _PyInterpreterFrame *res = (_PyInterpreterFrame *)tstate->stack_top; + tstate->stack_top += size; return res; } - return (_PyInterpreterFrame *)push_chunk(tstate, (int)size); + return (_PyInterpreterFrame *) resize_stack(tstate, (int) size); } void -_PyThreadState_PopFrame(PyThreadState *tstate, _PyInterpreterFrame * frame) -{ - assert(tstate->datastack_chunk); - PyObject **base = (PyObject **)frame; - if (base == &tstate->datastack_chunk->data[0]) { - _PyStackChunk *chunk = tstate->datastack_chunk; - _PyStackChunk *previous = chunk->previous; - _PyStackChunk *cached = tstate->datastack_cached_chunk; - // push_chunk ensures that the root chunk is never popped: - assert(previous); - tstate->datastack_top = &previous->data[previous->top]; - tstate->datastack_chunk = previous; - tstate->datastack_limit = (PyObject **)(((char *)previous) + previous->size); - chunk->previous = NULL; - if (cached != NULL) { - _PyObject_VirtualFree(cached, cached->size); - } - tstate->datastack_cached_chunk = chunk; - } - else { - assert(tstate->datastack_top); - assert(tstate->datastack_top >= base); - tstate->datastack_top = base; - } +_PyThreadState_PopFrame(PyThreadState *tstate, _PyInterpreterFrame *frame) +{ + assert(tstate->stack_chunk_list); + PyObject **base = (PyObject **)_Py_ensure_frame_in_current_stack_chunk(tstate, (char *)frame); + assert(ptr_in_chunk((char *)base, tstate->stack_chunk_list)); + assert(tstate->stack_top); + assert(tstate->stack_top >= base); + tstate->stack_top = base; }