Merged
1 change: 1 addition & 0 deletions Include/cpython/pystats.h
@@ -144,6 +144,7 @@ typedef struct _optimization_stats {
     uint64_t unknown_callee;
     uint64_t trace_immediately_deopts;
     uint64_t executors_invalidated;
+    uint64_t fitness_terminated_traces;
     UOpStats opcode[PYSTATS_MAX_UOP_ID + 1];
     uint64_t unsupported_opcode[256];
     uint64_t trace_length_hist[_Py_UOP_HIST_SIZE];
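The matching counter bump is not shown in this hunk; presumably the tracer increments it wherever it terminates a trace for fitness reasons, using CPython's existing stats macro. A minimal sketch, with the call site assumed:

    // Sketch of the assumed call site: record a trace terminated by the
    // fitness check. OPT_STAT_INC is CPython's existing helper and compiles
    // to a no-op unless the interpreter is built with Py_STATS.
    OPT_STAT_INC(fitness_terminated_traces);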
3 changes: 3 additions & 0 deletions Include/internal/pycore_interp_structs.h
@@ -449,6 +449,9 @@ typedef struct _PyOptimizationConfig {
     uint16_t side_exit_initial_value;
     uint16_t side_exit_initial_backoff;
 
+    // Trace fitness thresholds
+    uint16_t fitness_initial;
+
     // Optimization flags
     bool specialization_enabled;
     bool uops_optimize_enabled;
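One hedged reading of the new field: the tracer can seed its budget from per-interpreter config rather than hard-coding the compile-time default. initial_fitness() below is a hypothetical helper, not something shown in this diff:

    // Hypothetical helper (not in the diff): seed the tracer's fitness
    // budget from the per-interpreter config, falling back to the
    // compile-time default (FITNESS_INITIAL, defined in pycore_optimizer.h).
    static inline int32_t
    initial_fitness(const _PyOptimizationConfig *config)
    {
        return config->fitness_initial ? (int32_t)config->fitness_initial
                                       : (int32_t)FITNESS_INITIAL;
    }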
47 changes: 46 additions & 1 deletion Include/internal/pycore_optimizer.h
@@ -15,6 +15,50 @@ extern "C" {
 #include "pycore_optimizer_types.h"
 #include <stdbool.h>
 
+/* Fitness controls how long a trace can grow.
+ * It starts at FITNESS_INITIAL and decreases with per-bytecode buffer usage
+ * plus branch/frame heuristics. The trace stops when fitness drops below the
+ * current exit_quality.
+ *
+ * Design targets for the constants below:
+ * 1. Reaching the abstract frame-depth limit should drop fitness below
+ *    EXIT_QUALITY_SPECIALIZABLE.
+ * 2. A backward edge should leave budget for roughly N_BACKWARD_SLACK more
+ *    bytecodes, assuming AVG_SLOTS_PER_INSTRUCTION.
+ * 3. Roughly seven balanced branches should reduce fitness to
+ *    EXIT_QUALITY_DEFAULT after per-slot costs.
+ * 4. A push followed by a matching return is net-zero on frame-specific
+ *    fitness, excluding per-slot costs.
+ */
+#define MAX_TARGET_LENGTH (UOP_MAX_TRACE_LENGTH / 2)
+#define OPTIMIZER_EFFECTIVENESS 2
+#define FITNESS_INITIAL (MAX_TARGET_LENGTH * OPTIMIZER_EFFECTIVENESS)
+
+/* Exit quality thresholds: the trace stops when fitness < exit_quality.
+ * Higher = the trace is more willing to stop here. */
+#define EXIT_QUALITY_CLOSE_LOOP (FITNESS_INITIAL - AVG_SLOTS_PER_INSTRUCTION*4)
+#define EXIT_QUALITY_ENTER_EXECUTOR (FITNESS_INITIAL * 1 / 8)
+#define EXIT_QUALITY_DEFAULT (FITNESS_INITIAL / 40)
+#define EXIT_QUALITY_SPECIALIZABLE (FITNESS_INITIAL / 80)
+
+/* Estimated buffer slots per bytecode, used only to derive the heuristics.
+ * Runtime charging uses the trace-buffer capacity consumed by each bytecode. */
+#define AVG_SLOTS_PER_INSTRUCTION 6
+
+/* Heuristic backward-edge exit quality: leave room for about one unroll and
+ * N_BACKWARD_SLACK more bytecodes before reaching EXIT_QUALITY_CLOSE_LOOP,
+ * based on AVG_SLOTS_PER_INSTRUCTION. */
+#define N_BACKWARD_SLACK 10
+#define EXIT_QUALITY_BACKWARD_EDGE (EXIT_QUALITY_CLOSE_LOOP / 2 - N_BACKWARD_SLACK * AVG_SLOTS_PER_INSTRUCTION)
+
+/* Penalty for a balanced branch.
+ * It is sized so repeated balanced branches can drive a trace toward
+ * EXIT_QUALITY_DEFAULT, while compute_branch_penalty() keeps any single
+ * branch from dominating the budget. */
+#define FITNESS_BRANCH_BALANCED ((FITNESS_INITIAL - EXIT_QUALITY_DEFAULT - \
+    (MAX_TARGET_LENGTH / 14 * AVG_SLOTS_PER_INSTRUCTION)) / (14))
+
+
 typedef struct _PyJitUopBuffer {
     _PyUOpInstruction *start;
@@ -103,7 +147,8 @@ typedef struct _PyJitTracerPreviousState {
 } _PyJitTracerPreviousState;
 
 typedef struct _PyJitTracerTranslatorState {
-    int jump_backward_seen;
+    int32_t fitness;      // Current trace fitness; starts high and decrements
+    int frame_depth;      // Current inline depth (0 = root frame)
 } _PyJitTracerTranslatorState;
 
 typedef struct _PyJitTracerState {
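To make the budget concrete, here is a hedged sketch of the values these macros produce, assuming UOP_MAX_TRACE_LENGTH is 800 (its real definition lives elsewhere in this header and should be checked). should_stop_tracing() is a hypothetical name for the comparison the comments describe, not the PR's function:

    /* Hedged sketch, not part of the PR. With UOP_MAX_TRACE_LENGTH == 800,
     * the macros above evaluate (using C integer division) to:
     *   MAX_TARGET_LENGTH           = 800 / 2                    = 400
     *   FITNESS_INITIAL             = 400 * 2                    = 800
     *   EXIT_QUALITY_CLOSE_LOOP     = 800 - 6*4                  = 776
     *   EXIT_QUALITY_ENTER_EXECUTOR = 800 * 1 / 8                = 100
     *   EXIT_QUALITY_DEFAULT        = 800 / 40                   = 20
     *   EXIT_QUALITY_SPECIALIZABLE  = 800 / 80                   = 10
     *   EXIT_QUALITY_BACKWARD_EDGE  = 776/2 - 10*6               = 328
     *   FITNESS_BRANCH_BALANCED     = (800 - 20 - 400/14*6) / 14 = 43
     */
    #include <stdbool.h>
    #include <stdint.h>

    // Hypothetical helper: the stop rule the header comments describe.
    static inline bool
    should_stop_tracing(int32_t fitness, int32_t exit_quality)
    {
        // The trace ends once the remaining budget falls below the
        // quality of stopping at the current instruction.
        return fitness < exit_quality;
    }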
8 changes: 6 additions & 2 deletions Lib/test/test_capi/test_opt.py
@@ -1427,9 +1427,13 @@ def testfunc(n):
             for _ in gen(n):
                 pass
         testfunc(TIER2_THRESHOLD * 2)
+        # The generator may be inlined into testfunc's trace,
+        # so check whichever executor contains _YIELD_VALUE.
         gen_ex = get_first_executor(gen)
-        self.assertIsNotNone(gen_ex)
-        uops = get_opnames(gen_ex)
+        testfunc_ex = get_first_executor(testfunc)
+        ex = gen_ex or testfunc_ex
+        self.assertIsNotNone(ex)
+        uops = get_opnames(ex)
         self.assertNotIn("_MAKE_HEAP_SAFE", uops)
         self.assertIn("_YIELD_VALUE", uops)
 
7 changes: 5 additions & 2 deletions Modules/_testinternalcapi/test_cases.c.h

Some generated files are not rendered by default.

7 changes: 5 additions & 2 deletions Python/bytecodes.c
@@ -3529,7 +3529,7 @@ dummy_func(
             int og_oparg = (oparg & ~255) | executor->vm_data.oparg;
             next_instr = this_instr;
             if (_PyJit_EnterExecutorShouldStopTracing(og_opcode)) {
-                if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]) {
+                if (_PyOpcode_Caches[_PyOpcode_Deopt[og_opcode]]) {
                     PAUSE_ADAPTIVE_COUNTER(this_instr[1].counter);
                 }
                 opcode = og_opcode;
@@ -6541,7 +6541,10 @@ dummy_func(
             tracer->prev_state.instr_frame = frame;
             tracer->prev_state.instr_oparg = oparg;
             tracer->prev_state.instr_stacklevel = PyStackRef_IsNone(frame->f_executable) ? 2 : STACK_LEVEL();
-            if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]) {
+            if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]]
+                // Branch opcodes use the cache for branch history, not
+                // specialization counters. Don't reset it.
+                && !IS_CONDITIONAL_JUMP_OPCODE(opcode)) {
                 (&next_instr[1])->counter = trigger_backoff_counter();
             }
 
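IS_CONDITIONAL_JUMP_OPCODE is taken as given by the diff. As a rough illustration only, a predicate of that shape might look like the following; the opcode set here is an assumption, not CPython's actual macro:

    // Rough, assumed illustration of a conditional-jump predicate.
    // CPython's real IS_CONDITIONAL_JUMP_OPCODE lives in its internal
    // headers and may cover a different opcode set.
    static inline int
    is_conditional_jump_opcode(int opcode)
    {
        switch (opcode) {
            case POP_JUMP_IF_TRUE:
            case POP_JUMP_IF_FALSE:
            case POP_JUMP_IF_NONE:
            case POP_JUMP_IF_NOT_NONE:
                return 1;
            default:
                return 0;
        }
    }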
7 changes: 5 additions & 2 deletions Python/generated_cases.c.h

Some generated files are not rendered by default.
