diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index cf01c620476ff7..e55d859e5555b8 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -41,6 +41,9 @@ typedef struct _JitOptContext { // Arena for the symbolic types. ty_arena t_arena; + // Pool to store promoted constants to be used at runtime. + PyObject *constant_pool; + /* To do -- We could make this more space efficient * by using a single array and growing the stack and * locals toward each other. */ @@ -146,6 +149,7 @@ typedef struct _PyExitData { typedef struct _PyExecutorObject { PyObject_VAR_HEAD const _PyUOpInstruction *trace; + PyObject *constant_pool; _PyVMData vm_data; /* Used by the VM, but opaque to the optimizer */ uint32_t exit_count; uint32_t code_size; @@ -262,7 +266,8 @@ PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp); int _Py_uop_analyze_and_optimize( _PyThreadStateImpl *tstate, _PyUOpInstruction *input, int trace_len, int curr_stackentries, - _PyUOpInstruction *output, _PyBloomFilter *dependencies); + _PyUOpInstruction *output, _PyBloomFilter *dependencies, + PyObject **constant_pool_ptr); extern PyTypeObject _PyUOpExecutor_Type; @@ -427,8 +432,9 @@ extern JitOptRef *_Py_uop_sym_set_stack_depth(JitOptContext *ctx, int stack_dept extern uint32_t _Py_uop_sym_get_func_version(JitOptRef ref); bool _Py_uop_sym_set_func_version(JitOptContext *ctx, JitOptRef ref, uint32_t version); -extern void _Py_uop_abstractcontext_init(JitOptContext *ctx, _PyBloomFilter *dependencies); +extern int _Py_uop_abstractcontext_init(JitOptContext *ctx, _PyBloomFilter *dependencies); extern void _Py_uop_abstractcontext_fini(JitOptContext *ctx); +extern int _Py_uop_promote_to_constant_pool(JitOptContext *ctx, PyObject *obj); extern _Py_UOpsAbstractFrame *_Py_uop_frame_new( JitOptContext *ctx, diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index f11413cc625422..640861717b2c40 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -4229,8 +4229,7 @@ def testfunc(n): self.assertIsNotNone(ex) uops = get_opnames(ex) - # For now... until we constant propagate it away. - self.assertIn("_BINARY_OP", uops) + self.assertIn("_LOAD_CONST_INLINE_BORROW", uops) def test_jitted_code_sees_changed_globals(self): "Issue 136154: Check that jitted code spots the change in the globals" diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-11-03-21-10-54.gh-issue-140928.purSIt.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-03-21-10-54.gh-issue-140928.purSIt.rst new file mode 100644 index 00000000000000..b1f4a784ef57ac --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-11-03-21-10-54.gh-issue-140928.purSIt.rst @@ -0,0 +1 @@ +Add a constant pool to all JIT executors and allow promotion of constants to the pool. Patch by Ken Jin. Implementation in CPython inspired by PyPy/RPython. diff --git a/Python/optimizer.c b/Python/optimizer.c index f09bf778587b12..ffc0799d0892cf 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -114,7 +114,8 @@ insert_executor(PyCodeObject *code, _Py_CODEUNIT *instr, int index, _PyExecutorO #endif // Py_GIL_DISABLED static _PyExecutorObject * -make_executor_from_uops(_PyThreadStateImpl *tstate, _PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies); +make_executor_from_uops(_PyThreadStateImpl *tstate, _PyUOpInstruction *buffer, + int length, const _PyBloomFilter *dependencies, PyObject *constant_pool); static int uop_optimize(_PyInterpreterFrame *frame, PyThreadState *tstate, @@ -440,6 +441,7 @@ static int executor_traverse(PyObject *o, visitproc visit, void *arg) { _PyExecutorObject *executor = _PyExecutorObject_CAST(o); + Py_VISIT(executor->constant_pool); for (uint32_t i = 0; i < executor->exit_count; i++) { Py_VISIT(executor->exits[i].executor); } @@ -1286,13 +1288,15 @@ prepare_for_execution(_PyUOpInstruction *buffer, int length) /* Executor side exits */ static _PyExecutorObject * -allocate_executor(int exit_count, int length) +allocate_executor(int exit_count, int length, PyObject *constant_pool) { int size = exit_count*sizeof(_PyExitData) + length*sizeof(_PyUOpInstruction); _PyExecutorObject *res = PyObject_GC_NewVar(_PyExecutorObject, &_PyUOpExecutor_Type, size); if (res == NULL) { return NULL; } + // Transfer ownership + res->constant_pool = constant_pool; res->trace = (_PyUOpInstruction *)(res->exits + exit_count); res->code_size = length; res->exit_count = exit_count; @@ -1373,10 +1377,11 @@ sanity_check(_PyExecutorObject *executor) * and not a NOP. */ static _PyExecutorObject * -make_executor_from_uops(_PyThreadStateImpl *tstate, _PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies) +make_executor_from_uops(_PyThreadStateImpl *tstate, _PyUOpInstruction *buffer, + int length, const _PyBloomFilter *dependencies, PyObject *constant_pool) { int exit_count = count_exits(buffer, length); - _PyExecutorObject *executor = allocate_executor(exit_count, length); + _PyExecutorObject *executor = allocate_executor(exit_count, length, constant_pool); if (executor == NULL) { return NULL; } @@ -1522,6 +1527,7 @@ uop_optimize( _PyExecutorObject **exec_ptr, bool progress_needed) { + PyObject *constant_pool = NULL; _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate; assert(_tstate->jit_tracer_state != NULL); _PyUOpInstruction *buffer = _tstate->jit_tracer_state->code_buffer.start; @@ -1542,7 +1548,7 @@ uop_optimize( _PyUOpInstruction *output = &_tstate->jit_tracer_state->uop_array[UOP_MAX_TRACE_LENGTH]; length = _Py_uop_analyze_and_optimize( _tstate, buffer, length, curr_stackentries, - output, &dependencies); + output, &dependencies, &constant_pool); if (length <= 0) { return length; @@ -1580,7 +1586,7 @@ uop_optimize( length = prepare_for_execution(buffer, length); assert(length <= UOP_MAX_TRACE_LENGTH); _PyExecutorObject *executor = make_executor_from_uops( - _tstate, buffer, length, &dependencies); + _tstate, buffer, length, &dependencies, constant_pool); if (executor == NULL) { return -1; } @@ -1663,7 +1669,7 @@ _Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_s static _PyExecutorObject * make_cold_executor(uint16_t opcode) { - _PyExecutorObject *cold = allocate_executor(0, 1); + _PyExecutorObject *cold = allocate_executor(0, 1, NULL); if (cold == NULL) { Py_FatalError("Cannot allocate core JIT code"); } @@ -1752,6 +1758,7 @@ executor_invalidate(PyObject *op) } executor->vm_data.valid = 0; unlink_executor(executor); + Py_CLEAR(executor->constant_pool); executor_clear_exits(executor); _Py_ExecutorDetach(executor); _PyObject_GC_UNTRACK(op); diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c index ca9bcc8a40c35e..ac30c2c2222940 100644 --- a/Python/optimizer_analysis.c +++ b/Python/optimizer_analysis.c @@ -237,6 +237,8 @@ add_op(JitOptContext *ctx, _PyUOpInstruction *this_instr, ctx->out_buffer.next++; } +#define sym_promote_to_constant_pool _Py_uop_promote_to_constant_pool + /* Shortened forms for convenience, used in optimizer_bytecodes.c */ #define sym_is_not_null _Py_uop_sym_is_not_null #define sym_is_const _Py_uop_sym_is_const @@ -557,7 +559,8 @@ optimize_uops( int trace_len, int curr_stacklen, _PyUOpInstruction *output, - _PyBloomFilter *dependencies + _PyBloomFilter *dependencies, + PyObject **constant_pool_ptr ) { assert(!PyErr_Occurred()); @@ -576,9 +579,15 @@ optimize_uops( interp->type_watchers[TYPE_WATCHER_ID] = type_watcher_callback; } - _Py_uop_abstractcontext_init(ctx, dependencies); + if (_Py_uop_abstractcontext_init(ctx, dependencies)) { + assert(PyErr_Occurred()); + PyErr_Clear(); + return 0; + } + _Py_UOpsAbstractFrame *frame = _Py_uop_frame_new(ctx, (PyCodeObject *)func->func_code, NULL, 0); if (frame == NULL) { + _Py_uop_abstractcontext_fini(ctx); return 0; } frame->func = func; @@ -652,6 +661,7 @@ optimize_uops( /* Either reached the end or cannot optimize further, but there * would be no benefit in retrying later */ + *constant_pool_ptr = Py_NewRef(ctx->constant_pool); _Py_uop_abstractcontext_fini(ctx); // Check that the trace ends with a proper terminator if (uop_buffer_length(&ctx->out_buffer) > 0) { @@ -788,13 +798,15 @@ _Py_uop_analyze_and_optimize( int length, int curr_stacklen, _PyUOpInstruction *output, - _PyBloomFilter *dependencies + _PyBloomFilter *dependencies, + PyObject **constant_pool_ptr ) { OPT_STAT_INC(optimizer_attempts); length = optimize_uops( - tstate, buffer, length, curr_stacklen, output, dependencies); + tstate, buffer, length, curr_stacklen, output, + dependencies, constant_pool_ptr); if (length == 0) { return length; diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index 2db2c87cb3610b..a9a1db2e8be0a6 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -236,7 +236,7 @@ res = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(res_stackref)); if (sym_is_const(ctx, res)) { PyObject *result = sym_get_const(ctx, res); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _LOAD_CONST_INLINE_BORROW + _SWAP since we have one input and an immortal result ADD_OP(_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)result); ADD_OP(_SWAP, 2, 0); @@ -305,7 +305,7 @@ res = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(res_stackref)); if (sym_is_const(ctx, res)) { PyObject *result = sym_get_const(ctx, res); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _POP_TOP + _LOAD_CONST_INLINE_BORROW since we have one input and an immortal result ADD_OP(_POP_TOP, 0, 0); ADD_OP(_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)result); @@ -523,7 +523,7 @@ res = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(res_stackref)); if (sym_is_const(ctx, res)) { PyObject *result = sym_get_const(ctx, res); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _LOAD_CONST_INLINE_BORROW + _SWAP since we have one input and an immortal result ADD_OP(_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)result); ADD_OP(_SWAP, 2, 0); @@ -637,7 +637,7 @@ res = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(res_stackref)); if (sym_is_const(ctx, res)) { PyObject *result = sym_get_const(ctx, res); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _LOAD_CONST_INLINE_BORROW + _SWAP + _SWAP since we have two inputs and an immortal result ADD_OP(_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)result); ADD_OP(_SWAP, 3, 0); @@ -709,7 +709,7 @@ res = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(res_stackref)); if (sym_is_const(ctx, res)) { PyObject *result = sym_get_const(ctx, res); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _LOAD_CONST_INLINE_BORROW + _SWAP + _SWAP since we have two inputs and an immortal result ADD_OP(_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)result); ADD_OP(_SWAP, 3, 0); @@ -781,7 +781,7 @@ res = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(res_stackref)); if (sym_is_const(ctx, res)) { PyObject *result = sym_get_const(ctx, res); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _LOAD_CONST_INLINE_BORROW + _SWAP + _SWAP since we have two inputs and an immortal result ADD_OP(_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)result); ADD_OP(_SWAP, 3, 0); @@ -1508,7 +1508,7 @@ res = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(res_stackref)); if (sym_is_const(ctx, res)) { PyObject *result = sym_get_const(ctx, res); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _LOAD_CONST_INLINE_BORROW + _SWAP + _SWAP since we have two inputs and an immortal result ADD_OP(_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)result); ADD_OP(_SWAP, 3, 0); @@ -2705,7 +2705,7 @@ res = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(res_stackref)); if (sym_is_const(ctx, res)) { PyObject *result = sym_get_const(ctx, res); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _POP_TOP + _POP_TOP + _LOAD_CONST_INLINE_BORROW since we have two inputs and an immortal result ADD_OP(_POP_TOP, 0, 0); ADD_OP(_POP_TOP, 0, 0); @@ -2778,7 +2778,7 @@ res = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(res_stackref)); if (sym_is_const(ctx, res)) { PyObject *result = sym_get_const(ctx, res); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _LOAD_CONST_INLINE_BORROW + _SWAP + _SWAP since we have two inputs and an immortal result ADD_OP(_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)result); ADD_OP(_SWAP, 3, 0); @@ -2853,7 +2853,7 @@ res = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(res_stackref)); if (sym_is_const(ctx, res)) { PyObject *result = sym_get_const(ctx, res); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _LOAD_CONST_INLINE_BORROW + _SWAP + _SWAP since we have two inputs and an immortal result ADD_OP(_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)result); ADD_OP(_SWAP, 3, 0); @@ -2917,7 +2917,7 @@ res = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(res_stackref)); if (sym_is_const(ctx, res)) { PyObject *result = sym_get_const(ctx, res); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _LOAD_CONST_INLINE_BORROW + _SWAP + _SWAP since we have two inputs and an immortal result ADD_OP(_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)result); ADD_OP(_SWAP, 3, 0); @@ -2999,7 +2999,7 @@ b = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(b_stackref)); if (sym_is_const(ctx, b)) { PyObject *result = sym_get_const(ctx, b); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _LOAD_CONST_INLINE_BORROW + _SWAP + _SWAP since we have two inputs and an immortal result ADD_OP(_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)result); ADD_OP(_SWAP, 3, 0); @@ -3108,7 +3108,7 @@ b = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(b_stackref)); if (sym_is_const(ctx, b)) { PyObject *result = sym_get_const(ctx, b); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _LOAD_CONST_INLINE_BORROW + _SWAP + _SWAP since we have two inputs and an immortal result ADD_OP(_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)result); ADD_OP(_SWAP, 3, 0); @@ -4742,7 +4742,7 @@ res = sym_new_const_steal(ctx, PyStackRef_AsPyObjectSteal(res_stackref)); if (sym_is_const(ctx, res)) { PyObject *result = sym_get_const(ctx, res); - if (_Py_IsImmortal(result)) { + if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) { // Replace with _LOAD_CONST_INLINE_BORROW + _SWAP + _SWAP since we have two inputs and an immortal result ADD_OP(_LOAD_CONST_INLINE_BORROW, 0, (uintptr_t)result); ADD_OP(_SWAP, 3, 0); diff --git a/Python/optimizer_symbols.c b/Python/optimizer_symbols.c index d0f33b80a570dd..de5ded1f33172d 100644 --- a/Python/optimizer_symbols.c +++ b/Python/optimizer_symbols.c @@ -1604,12 +1604,13 @@ _Py_uop_abstractcontext_fini(JitOptContext *ctx) Py_CLEAR(sym->value.value); } } + Py_CLEAR(ctx->constant_pool); } // Leave a bit of space to push values before checking that there is space for a new frame #define STACK_HEADROOM 2 -void +int _Py_uop_abstractcontext_init(JitOptContext *ctx, _PyBloomFilter *dependencies) { static_assert(sizeof(JitOptSymbol) <= 3 * sizeof(uint64_t), "JitOptSymbol has grown"); @@ -1642,6 +1643,20 @@ _Py_uop_abstractcontext_init(JitOptContext *ctx, _PyBloomFilter *dependencies) ctx->contradiction = false; ctx->builtins_watched = false; ctx->dependencies = dependencies; + + ctx->constant_pool = PyList_New(0); + if (ctx->constant_pool == NULL) { + return -1; + } + + return 0; +} + +// -1 if err, 0 if success +int +_Py_uop_promote_to_constant_pool(JitOptContext *ctx, PyObject *obj) +{ + return PyList_Append(ctx->constant_pool, obj); } int @@ -1712,7 +1727,9 @@ _Py_uop_symbols_test(PyObject *Py_UNUSED(self), PyObject *Py_UNUSED(ignored)) { JitOptContext context; JitOptContext *ctx = &context; - _Py_uop_abstractcontext_init(ctx, NULL); + if (_Py_uop_abstractcontext_init(ctx, NULL) < 0) { + return NULL; + } PyObject *val_42 = NULL; PyObject *val_43 = NULL; PyObject *val_big = NULL; diff --git a/Tools/cases_generator/optimizer_generator.py b/Tools/cases_generator/optimizer_generator.py index b6b59838a70501..1c41d12682c59e 100644 --- a/Tools/cases_generator/optimizer_generator.py +++ b/Tools/cases_generator/optimizer_generator.py @@ -279,7 +279,7 @@ def replace_opcode_if_evaluates_pure( emitter.emit(f"if (sym_is_const(ctx, {output_identifier.text})) {{\n") emitter.emit(f"PyObject *result = sym_get_const(ctx, {output_identifier.text});\n") - emitter.emit(f"if (_Py_IsImmortal(result)) {{\n") + emitter.emit(f"if (_Py_uop_promote_to_constant_pool(ctx, result) == 0) {{\n") emitter.emit(f"// Replace with {ops_desc} since we have {input_desc} and an immortal result\n") for op, args in ops: emitter.emit(f"ADD_OP({op}, {args});\n")