From ab41a347ebc7e6c3e3f5795c4a24545bfbf92a6e Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 23 Apr 2026 11:52:13 +0200 Subject: [PATCH 1/3] gh-146636: Improve ABI/feature selection, add new header for it (GH-148302) Improve ABI/feature selection, add new header for it. Add a test that Python headers themselves don't use Py_GIL_DISABLED in abi3t: abi3 and abi3t ought to be the same except the _Py_OPAQUE_PYOBJECT differences. This is done using the GCC-only poison pragma. Co-authored-by: Victor Stinner --- Include/Python.h | 22 ++-- Include/exports.h | 8 +- Include/patchlevel.h | 28 ---- Include/pyabi.h | 121 ++++++++++++++++++ Include/pyport.h | 39 ------ Lib/test/test_cext/setup.py | 5 + Makefile.pre.in | 1 + ...-04-09-14-45-44.gh-issue-148267.p84kG_.rst | 2 + PCbuild/pythoncore.vcxproj | 1 + PCbuild/pythoncore.vcxproj.filters | 3 + 10 files changed, 148 insertions(+), 82 deletions(-) create mode 100644 Include/pyabi.h create mode 100644 Misc/NEWS.d/next/C_API/2026-04-09-14-45-44.gh-issue-148267.p84kG_.rst diff --git a/Include/Python.h b/Include/Python.h index e6e5cab67e2045..8b76195b320998 100644 --- a/Include/Python.h +++ b/Include/Python.h @@ -9,10 +9,11 @@ // is not needed. 
-// Include Python header files -#include "patchlevel.h" -#include "pyconfig.h" -#include "pymacconfig.h" +// Include Python configuration headers +#include "patchlevel.h" // the Python version +#include "pyconfig.h" // information from configure +#include "pymacconfig.h" // overrides for pyconfig +#include "pyabi.h" // feature/ABI selection // Include standard header files @@ -46,13 +47,11 @@ # endif #endif -#if defined(Py_GIL_DISABLED) -# if defined(_MSC_VER) -# include // __readgsqword() -# endif - -# if defined(__MINGW32__) -# include // __readgsqword() +#if !defined(Py_LIMITED_API) +# if defined(Py_GIL_DISABLED) +# if defined(_MSC_VER) || defined(__MINGW32__) +# include // __readgsqword() +# endif # endif #endif // Py_GIL_DISABLED @@ -67,6 +66,7 @@ __pragma(warning(disable: 4201)) // Include Python header files #include "pyport.h" +#include "exports.h" #include "pymacro.h" #include "pymath.h" #include "pymem.h" diff --git a/Include/exports.h b/Include/exports.h index 97a674ec2403a4..a863ecb33078ab 100644 --- a/Include/exports.h +++ b/Include/exports.h @@ -36,7 +36,7 @@ #define Py_LOCAL_SYMBOL #endif /* module init functions outside the core must be exported */ - #if defined(Py_BUILD_CORE) + #if defined(_PyEXPORTS_CORE) #define _PyINIT_EXPORTED_SYMBOL Py_EXPORTED_SYMBOL #else #define _PyINIT_EXPORTED_SYMBOL __declspec(dllexport) @@ -64,13 +64,13 @@ /* only get special linkage if built as shared or platform is Cygwin */ #if defined(Py_ENABLE_SHARED) || defined(__CYGWIN__) # if defined(HAVE_DECLSPEC_DLL) -# if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) +# if defined(_PyEXPORTS_CORE) && !defined(_PyEXPORTS_CORE_MODULE) /* module init functions inside the core need no external linkage */ /* except for Cygwin to handle embedding */ # if !defined(__CYGWIN__) # define _PyINIT_FUNC_DECLSPEC # endif /* __CYGWIN__ */ -# else /* Py_BUILD_CORE */ +# else /* _PyEXPORTS_CORE */ /* Building an extension module, or an embedded situation */ /* public Python 
functions and data are imported */ /* Under Cygwin, auto-import functions to prevent compilation */ @@ -80,7 +80,7 @@ # define PyAPI_FUNC(RTYPE) Py_IMPORTED_SYMBOL RTYPE # endif /* !__CYGWIN__ */ # define PyAPI_DATA(RTYPE) extern Py_IMPORTED_SYMBOL RTYPE -# endif /* Py_BUILD_CORE */ +# endif /* _PyEXPORTS_CORE */ # endif /* HAVE_DECLSPEC_DLL */ #endif /* Py_ENABLE_SHARED */ diff --git a/Include/patchlevel.h b/Include/patchlevel.h index 9f5c36230a7e45..974246f896e10b 100644 --- a/Include/patchlevel.h +++ b/Include/patchlevel.h @@ -61,32 +61,4 @@ #define PYTHON_ABI_VERSION 3 #define PYTHON_ABI_STRING "3" - -/* Stable ABI for free-threaded builds (introduced in PEP 803) - is enabled by one of: - - Py_TARGET_ABI3T, or - - Py_LIMITED_API and Py_GIL_DISABLED. - "Output" macros to be used internally: - - Py_LIMITED_API (defines the subset of API we expose) - - _Py_OPAQUE_PYOBJECT (additionally hides what's ABI-incompatible between - free-threaded & GIL) - (Don't use Py_TARGET_ABI3T directly: it's currently only used to set these - 2 macros. It's also available for users' convenience.) - */ -#if defined(Py_LIMITED_API) && defined(Py_GIL_DISABLED) \ - && !defined(Py_TARGET_ABI3T) -# define Py_TARGET_ABI3T Py_LIMITED_API -#endif -#if defined(Py_TARGET_ABI3T) -# define _Py_OPAQUE_PYOBJECT -# if !defined(Py_LIMITED_API) -# define Py_LIMITED_API Py_TARGET_ABI3T -# elif Py_LIMITED_API > Py_TARGET_ABI3T - // if both are defined, use the *lower* version, - // i.e. 
maximum compatibility -# undef Py_LIMITED_API -# define Py_LIMITED_API Py_TARGET_ABI3T -# endif -#endif - #endif //_Py_PATCHLEVEL_H diff --git a/Include/pyabi.h b/Include/pyabi.h new file mode 100644 index 00000000000000..8c4ae281a43faf --- /dev/null +++ b/Include/pyabi.h @@ -0,0 +1,121 @@ +/* Macros that restrict available definitions and select implementations + * to match an ABI stability promise: + * + * - internal API/ABI (may change at any time) -- Py_BUILD_CORE* + * - general CPython API/ABI (may change in 3.x.0) -- default + * - Stable ABI: abi3, abi3t (long-term stable) -- Py_LIMITED_API, + * Py_TARGET_ABI3T, _Py_OPAQUE_PYOBJECT + * - Free-threading (incompatible with non-free-threading builds) + * -- Py_GIL_DISABLED + */ + +#ifndef _Py_PYABI_H +#define _Py_PYABI_H + +/* Defines to build Python and its standard library: + * + * - Py_BUILD_CORE: Build Python core. Gives access to Python internals; should + * not be used by third-party modules. + * - Py_BUILD_CORE_BUILTIN: Build a Python stdlib module as a built-in module. + * - Py_BUILD_CORE_MODULE: Build a Python stdlib module as a dynamic library. + * + * Py_BUILD_CORE_BUILTIN and Py_BUILD_CORE_MODULE imply Py_BUILD_CORE. + * + * On Windows, Py_BUILD_CORE_MODULE exports "PyInit_xxx" symbol, whereas + * Py_BUILD_CORE_BUILTIN does not. + */ +#if defined(Py_BUILD_CORE_BUILTIN) && !defined(Py_BUILD_CORE) +# define Py_BUILD_CORE +#endif +#if defined(Py_BUILD_CORE_MODULE) && !defined(Py_BUILD_CORE) +# define Py_BUILD_CORE +#endif + +/* Check valid values for target ABI macros. + */ +#if defined(Py_LIMITED_API) && Py_LIMITED_API+0 < 3 + // Empty Py_LIMITED_API used to work; redefine to + // Python 3.2 to be explicit. 
+# undef Py_LIMITED_API +# define Py_LIMITED_API 0x03020000 +#endif +#if defined(Py_TARGET_ABI3T) && Py_TARGET_ABI3T+0 < 0x030f0000 +# error "Py_TARGET_ABI3T must be 0x030f0000 (3.15) or above" +#endif + +/* Stable ABI for free-threaded builds (abi3t, introduced in PEP 803) + * is enabled by one of: + * - Py_TARGET_ABI3T, or + * - Py_LIMITED_API and Py_GIL_DISABLED. + * + * These set the following macros, which Python.h should use internally: + * - Py_LIMITED_API (defines the subset of API we expose) + * - _Py_OPAQUE_PYOBJECT (additionally hides what's ABI-incompatible between + * free-threaded & GIL) + * + * (Don't use Py_TARGET_ABI3T directly. It's currently only used to set these + * 2 macros, and defined for users' convenience.) + */ +#if defined(Py_LIMITED_API) && defined(Py_GIL_DISABLED) \ + && !defined(Py_TARGET_ABI3T) +# define Py_TARGET_ABI3T Py_LIMITED_API +#endif +#if defined(Py_TARGET_ABI3T) +# define _Py_OPAQUE_PYOBJECT +# if !defined(Py_LIMITED_API) +# define Py_LIMITED_API Py_TARGET_ABI3T +# elif Py_LIMITED_API > Py_TARGET_ABI3T + // if both are defined, use the *lower* version, + // i.e. maximum compatibility +# undef Py_LIMITED_API +# define Py_LIMITED_API Py_TARGET_ABI3T +# endif +#else +# ifdef _Py_OPAQUE_PYOBJECT + // _Py_OPAQUE_PYOBJECT is a private macro; do not define it directly. +# error "Define Py_TARGET_ABI3T to target abi3t." +# endif +#endif + +#if defined(Py_TARGET_ABI3T) +# if !defined(Py_GIL_DISABLED) + // Define Py_GIL_DISABLED for users' needs. Users check this macro to see + // whether they need extra synchronization. +# define Py_GIL_DISABLED +# endif +# if defined(_Py_IS_TESTCEXT) + // When compiling for abi3t, contents of Python.h should not depend + // on Py_GIL_DISABLED. + // We ask GCC to error if it sees the macro from this point on. + // Since users are free to use the macro, and there's no way to undo the + // poisoning at the end of Python.h, we only do this in a test module + // (test_cext). 
+ // + // Clang's poisoning is stricter than GCC's: it looks in `#elif` + // expressions after matching `#if`s. We disable it for now. + // We also provide an undocumented, unsupported opt-out macro to help + // porting to other compilers. Consider reaching out if you use it. +# if defined(__GNUC__) && !defined(__clang__) && !defined(_Py_NO_GCC_POISON) +# undef Py_GIL_DISABLED +# pragma GCC poison Py_GIL_DISABLED +# endif +# endif +#endif + +/* The internal C API must not be used with the limited C API: make sure + * that Py_BUILD_CORE* macros are not defined in this case. + * But, keep the "original" values, under different names, for "exports.h" + */ +#ifdef Py_BUILD_CORE +# define _PyEXPORTS_CORE +#endif +#ifdef Py_BUILD_CORE_MODULE +# define _PyEXPORTS_CORE_MODULE +#endif +#ifdef Py_LIMITED_API +# undef Py_BUILD_CORE +# undef Py_BUILD_CORE_BUILTIN +# undef Py_BUILD_CORE_MODULE +#endif + +#endif // _Py_PYABI_H diff --git a/Include/pyport.h b/Include/pyport.h index 62cba4c1421f99..c975921beafb9e 100644 --- a/Include/pyport.h +++ b/Include/pyport.h @@ -58,34 +58,6 @@ #endif -/* Defines to build Python and its standard library: - * - * - Py_BUILD_CORE: Build Python core. Give access to Python internals, but - * should not be used by third-party modules. - * - Py_BUILD_CORE_BUILTIN: Build a Python stdlib module as a built-in module. - * - Py_BUILD_CORE_MODULE: Build a Python stdlib module as a dynamic library. - * - * Py_BUILD_CORE_BUILTIN and Py_BUILD_CORE_MODULE imply Py_BUILD_CORE. - * - * On Windows, Py_BUILD_CORE_MODULE exports "PyInit_xxx" symbol, whereas - * Py_BUILD_CORE_BUILTIN does not. - */ -#if defined(Py_BUILD_CORE_BUILTIN) && !defined(Py_BUILD_CORE) -# define Py_BUILD_CORE -#endif -#if defined(Py_BUILD_CORE_MODULE) && !defined(Py_BUILD_CORE) -# define Py_BUILD_CORE -#endif - -#if defined(Py_TARGET_ABI3T) -# if !defined(Py_GIL_DISABLED) -// Define Py_GIL_DISABLED for users' needs. 
This macro is used to enable -// locking needed in for free-threaded interpreters builds. -# define Py_GIL_DISABLED -# endif -#endif - - /************************************************************************** Symbols and macros to supply platform-independent interfaces to basic C language & library operations whose spellings vary across platforms. @@ -393,17 +365,6 @@ extern "C" { # define Py_NO_INLINE #endif -#include "exports.h" - -#ifdef Py_LIMITED_API - // The internal C API must not be used with the limited C API: make sure - // that Py_BUILD_CORE macro is not defined in this case. These 3 macros are - // used by exports.h, so only undefine them afterwards. -# undef Py_BUILD_CORE -# undef Py_BUILD_CORE_BUILTIN -# undef Py_BUILD_CORE_MODULE -#endif - /* limits.h constants that may be missing */ #ifndef INT_MAX diff --git a/Lib/test/test_cext/setup.py b/Lib/test/test_cext/setup.py index 7262a110d83415..25fe50df603883 100644 --- a/Lib/test/test_cext/setup.py +++ b/Lib/test/test_cext/setup.py @@ -18,6 +18,11 @@ # The purpose of test_cext extension is to check that building a C # extension using the Python C API does not emit C compiler warnings. 
'-Werror', + # Enable extra checks for header files, which: + # - need to be enabled somewhere inside Python headers (rather than + # before including Python.h) + # - should not be checked for user code + '-D_Py_IS_TESTCEXT', ] # C compiler flags for GCC and clang diff --git a/Makefile.pre.in b/Makefile.pre.in index f869c1f7c93776..57fce05d476e9e 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1214,6 +1214,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/osdefs.h \ $(srcdir)/Include/osmodule.h \ $(srcdir)/Include/patchlevel.h \ + $(srcdir)/Include/pyabi.h \ $(srcdir)/Include/pyatomic.h \ $(srcdir)/Include/pybuffer.h \ $(srcdir)/Include/pycapsule.h \ diff --git a/Misc/NEWS.d/next/C_API/2026-04-09-14-45-44.gh-issue-148267.p84kG_.rst b/Misc/NEWS.d/next/C_API/2026-04-09-14-45-44.gh-issue-148267.p84kG_.rst new file mode 100644 index 00000000000000..1ec1afd2cbfeb9 --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2026-04-09-14-45-44.gh-issue-148267.p84kG_.rst @@ -0,0 +1,2 @@ +Using :c:macro:`Py_LIMITED_API` on a non-Windows free-threaded build no +longer needs an extra :c:macro:`Py_GIL_DISABLED`. 
diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 61bee29c0af3d6..fe70e02536bbb6 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -359,6 +359,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 664788e69af19a..629f063861de9a 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -156,6 +156,9 @@ Include + + Include + Include From 29917d51ab41df9a00f1bb35fa9d3e1392ac48e8 Mon Sep 17 00:00:00 2001 From: Kumar Aditya Date: Thu, 23 Apr 2026 16:42:57 +0530 Subject: [PATCH 2/3] gh-148907: fix performance regression in `PyType_GetModuleByDef` on free-threading (#148908) --- Objects/typeobject.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 08b95cfbc6ce59..fb3c7101410683 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -5878,7 +5878,13 @@ PyType_GetModuleByToken_DuringGC(PyTypeObject *type, const void *token) PyObject * PyType_GetModuleByToken(PyTypeObject *type, const void *token) { - PyObject *mod = PyType_GetModuleByToken_DuringGC(type, token); + return Py_XNewRef(PyType_GetModuleByDef(type, (PyModuleDef *)token)); +} + +PyObject * +PyType_GetModuleByDef(PyTypeObject *type, PyModuleDef *def) +{ + PyObject *mod = PyType_GetModuleByToken_DuringGC(type, def); if (!mod) { PyErr_Format( PyExc_TypeError, @@ -5886,14 +5892,6 @@ PyType_GetModuleByToken(PyTypeObject *type, const void *token) type->tp_name); return NULL; } - return Py_NewRef(mod); -} - -PyObject * -PyType_GetModuleByDef(PyTypeObject *type, PyModuleDef *def) -{ - PyObject *mod = PyType_GetModuleByToken(type, def); - Py_XDECREF(mod); // return borrowed ref return mod; } From 9633c5239daae3a180f8ce263ce77e4e522e6aa4 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Thu, 23 Apr 2026 12:23:18 +0100 Subject: [PATCH 3/3] GH-126910: Build/link the JIT shim in the Python interpreter 
(#148872) --- Include/internal/pycore_ceval.h | 7 --- Include/internal/pycore_jit.h | 6 +- Makefile.pre.in | 26 ++++++-- PCbuild/pyproject.props | 5 +- PCbuild/pythoncore.vcxproj | 3 + PCbuild/regen.targets | 5 +- Python/ceval.c | 2 +- Python/jit.c | 96 ----------------------------- Python/pylifecycle.c | 8 --- Python/pystate.c | 5 -- Tools/jit/_targets.py | 106 ++++++++++++++++++++++---------- Tools/jit/_writer.py | 4 -- Tools/jit/build.py | 1 + Tools/jit/shim.c | 2 +- configure | 68 ++++++++++++-------- configure.ac | 67 ++++++++++++-------- 16 files changed, 202 insertions(+), 209 deletions(-) diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index ee8eb1095fe541..f9507fda1606db 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -121,18 +121,11 @@ _PyEval_EvalFrame(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwfl } #ifdef _Py_TIER2 -#ifdef _Py_JIT -_Py_CODEUNIT *_Py_LazyJitShim( - struct _PyExecutorObject *current_executor, _PyInterpreterFrame *frame, - _PyStackRef *stack_pointer, PyThreadState *tstate -); -#else _Py_CODEUNIT *_PyTier2Interpreter( struct _PyExecutorObject *current_executor, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate ); #endif -#endif extern _PyJitEntryFuncPtr _Py_jit_entry; diff --git a/Include/internal/pycore_jit.h b/Include/internal/pycore_jit.h index 70bccce4166c18..b3cadcce8247d0 100644 --- a/Include/internal/pycore_jit.h +++ b/Include/internal/pycore_jit.h @@ -23,9 +23,13 @@ typedef _Py_CODEUNIT *(*jit_func)( _PyStackRef _tos_cache0, _PyStackRef _tos_cache1, _PyStackRef _tos_cache2 ); +_Py_CODEUNIT *_PyJIT( + _PyExecutorObject *executor, _PyInterpreterFrame *frame, + _PyStackRef *stack_pointer, PyThreadState *tstate +); + int _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size_t length); void _PyJIT_Free(_PyExecutorObject *executor); -void _PyJIT_Fini(void); PyAPI_FUNC(int) 
_PyJIT_AddressInJitCode(PyInterpreterState *interp, uintptr_t addr); #endif // _Py_JIT diff --git a/Makefile.pre.in b/Makefile.pre.in index 57fce05d476e9e..8b46db33a2ac18 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -290,6 +290,7 @@ LDLIBRARYDIR= @LDLIBRARYDIR@ INSTSONAME= @INSTSONAME@ LIBRARY_DEPS= @LIBRARY_DEPS@ LINK_PYTHON_DEPS=@LINK_PYTHON_DEPS@ +JIT_OBJS= @JIT_SHIM_O@ PY_ENABLE_SHARED= @PY_ENABLE_SHARED@ STATIC_LIBPYTHON= @STATIC_LIBPYTHON@ @@ -469,6 +470,7 @@ PYTHON_OBJS= \ Python/instruction_sequence.o \ Python/intrinsics.o \ Python/jit.o \ + $(JIT_OBJS) \ Python/legacy_tracing.o \ Python/lock.o \ Python/marshal.o \ @@ -3204,21 +3206,37 @@ Python/emscripten_trampoline_inner.wasm: $(srcdir)/Python/emscripten_trampoline_ Python/emscripten_trampoline_wasm.c: Python/emscripten_trampoline_inner.wasm $(PYTHON_FOR_REGEN) $(srcdir)/Platforms/emscripten/prepare_external_wasm.py $< $@ getWasmTrampolineModule +JIT_SHIM_BUILD_OBJS= @JIT_SHIM_BUILD_O@ +JIT_BUILD_TARGETS= jit_stencils.h @JIT_STENCILS_H@ $(JIT_SHIM_BUILD_OBJS) +JIT_TARGETS= $(JIT_BUILD_TARGETS) $(filter-out $(JIT_SHIM_BUILD_OBJS),$(JIT_OBJS)) +JIT_GENERATED_STAMP= .jit-stamp + JIT_DEPS = \ $(srcdir)/Tools/jit/*.c \ + $(srcdir)/Tools/jit/*.h \ $(srcdir)/Tools/jit/*.py \ $(srcdir)/Python/executor_cases.c.h \ pyconfig.h -jit_stencils.h @JIT_STENCILS_H@: $(JIT_DEPS) +$(JIT_GENERATED_STAMP): $(JIT_DEPS) @REGEN_JIT_COMMAND@ + @touch $@ + +$(JIT_BUILD_TARGETS): $(JIT_GENERATED_STAMP) + @if test ! 
-f "$@"; then \ + rm -f $(JIT_GENERATED_STAMP); \ + $(MAKE) $(JIT_GENERATED_STAMP); \ + test -f "$@"; \ + fi + +jit_shim-universal2-apple-darwin.o: jit_shim-aarch64-apple-darwin.o jit_shim-x86_64-apple-darwin.o + lipo -create -output $@ jit_shim-aarch64-apple-darwin.o jit_shim-x86_64-apple-darwin.o Python/jit.o: $(srcdir)/Python/jit.c @JIT_STENCILS_H@ $(CC) -c $(PY_CORE_CFLAGS) -o $@ $< .PHONY: regen-jit -regen-jit: - @REGEN_JIT_COMMAND@ +regen-jit: $(JIT_TARGETS) # Some make's put the object file in the current directory .c.o: @@ -3342,7 +3360,7 @@ clean-profile: clean-retain-profile clean-bolt # gh-141808: The JIT stencils are deliberately kept in clean-profile .PHONY: clean-jit-stencils clean-jit-stencils: - -rm -f jit_stencils*.h + -rm -f $(JIT_TARGETS) $(JIT_GENERATED_STAMP) jit_stencils*.h jit_shim*.o .PHONY: clean clean: clean-profile clean-jit-stencils diff --git a/PCbuild/pyproject.props b/PCbuild/pyproject.props index 94ae718d58c4ba..f79608e1d58dbc 100644 --- a/PCbuild/pyproject.props +++ b/PCbuild/pyproject.props @@ -12,8 +12,9 @@ $(IntDir.Replace(`\\`, `\`)) $(Py_IntDir)\$(MajorVersionNumber)$(MinorVersionNumber)_frozen\ $(Py_IntDir)\$(MajorVersionNumber)$(MinorVersionNumber)$(ArchName)_$(Configuration)\zlib-ng\ - $(Py_IntDir)\$(MajorVersionNumber)$(MinorVersionNumber)_$(Configuration) - $(Py_IntDir)\$(MajorVersionNumber)$(MinorVersionNumber)_PGInstrument + $(Py_IntDir)\$(MajorVersionNumber)$(MinorVersionNumber)_$(Configuration)\ + $(Py_IntDir)\$(MajorVersionNumber)$(MinorVersionNumber)_PGInstrument\ + $(GeneratedJitStencilsDir.Replace(`\\`, `\`)) $(ProjectName) $(TargetName)$(PyDebugExt) false diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index fe70e02536bbb6..07305add81d055 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -115,6 +115,9 @@ version.lib;ws2_32.lib;pathcch.lib;bcrypt.lib;%(AdditionalDependencies) zlib-ng$(PyDebugExt).lib;%(AdditionalDependencies) + 
$(GeneratedJitStencilsDir)jit_shim-aarch64-pc-windows-msvc.o;%(AdditionalDependencies) + $(GeneratedJitStencilsDir)jit_shim-i686-pc-windows-msvc.o;%(AdditionalDependencies) + $(GeneratedJitStencilsDir)jit_shim-x86_64-pc-windows-msvc.o;%(AdditionalDependencies) diff --git a/PCbuild/regen.targets b/PCbuild/regen.targets index bb059f382eb375..9552e73ef6a2ec 100644 --- a/PCbuild/regen.targets +++ b/PCbuild/regen.targets @@ -35,6 +35,9 @@ <_JITOutputs Include="$(GeneratedJitStencilsDir)jit_stencils-aarch64-pc-windows-msvc.h" Condition="$(Platform) == 'ARM64'"/> <_JITOutputs Include="$(GeneratedJitStencilsDir)jit_stencils-i686-pc-windows-msvc.h" Condition="$(Platform) == 'Win32'"/> <_JITOutputs Include="$(GeneratedJitStencilsDir)jit_stencils-x86_64-pc-windows-msvc.h" Condition="$(Platform) == 'x64'"/> + <_JITOutputs Include="$(GeneratedJitStencilsDir)jit_shim-aarch64-pc-windows-msvc.o" Condition="$(Platform) == 'ARM64'"/> + <_JITOutputs Include="$(GeneratedJitStencilsDir)jit_shim-i686-pc-windows-msvc.o" Condition="$(Platform) == 'Win32'"/> + <_JITOutputs Include="$(GeneratedJitStencilsDir)jit_shim-x86_64-pc-windows-msvc.o" Condition="$(Platform) == 'x64'"/> <_CasesSources Include="$(PySourcePath)Python\bytecodes.c;$(PySourcePath)Python\optimizer_bytecodes.c;"/> <_CasesOutputs Include="$(PySourcePath)Python\generated_cases.c.h;$(PySourcePath)Include\opcode_ids.h;$(PySourcePath)Include\internal\pycore_uop_ids.h;$(PySourcePath)Python\opcode_targets.h;$(PySourcePath)Include\internal\pycore_opcode_metadata.h;$(PySourcePath)Include\internal\pycore_uop_metadata.h;$(PySourcePath)Python\optimizer_cases.c.h;$(PySourcePath)Lib\_opcode_metadata.py"/> <_SbomSources Include="$(PySourcePath)PCbuild\get_externals.bat" /> @@ -129,7 +132,7 @@ x86_64-pc-windows-msvc $(JITArgs) --debug - + diff --git a/Python/ceval.c b/Python/ceval.c index 967d92f4ea6855..506ea591c385c0 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1305,7 +1305,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, 
_PyInterpreterFrame *frame, int } #ifdef _Py_TIER2 #ifdef _Py_JIT -_PyJitEntryFuncPtr _Py_jit_entry = _Py_LazyJitShim; +_PyJitEntryFuncPtr _Py_jit_entry = _PyJIT; #else _PyJitEntryFuncPtr _Py_jit_entry = _PyTier2Interpreter; #endif diff --git a/Python/jit.c b/Python/jit.c index af75acf1ff2bb3..26e01b25d48c04 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -60,8 +60,6 @@ jit_error(const char *message) PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint); } -static size_t _Py_jit_shim_size = 0; - static int address_in_executor_array(_PyExecutorObject **ptrs, size_t count, uintptr_t addr) { @@ -104,13 +102,6 @@ _PyJIT_AddressInJitCode(PyInterpreterState *interp, uintptr_t addr) if (interp == NULL) { return 0; } - if (_Py_jit_entry != _Py_LazyJitShim && _Py_jit_shim_size != 0) { - uintptr_t start = (uintptr_t)_Py_jit_entry; - uintptr_t end = start + _Py_jit_shim_size; - if (addr >= start && addr < end) { - return 1; - } - } if (address_in_executor_array(interp->executor_ptrs, interp->executor_count, addr)) { return 1; } @@ -727,75 +718,6 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz return 0; } -/* One-off compilation of the jit entry shim - * We compile this once only as it effectively a normal - * function, but we need to use the JIT because it needs - * to understand the jit-specific calling convention. - * Don't forget to call _PyJIT_Fini later! 
- */ -static _PyJitEntryFuncPtr -compile_shim(void) -{ - _PyExecutorObject dummy; - const StencilGroup *group; - size_t code_size = 0; - size_t data_size = 0; - jit_state state = {0}; - group = &shim; - code_size += group->code_size; - data_size += group->data_size; - combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); - combine_symbol_mask(group->got_mask, state.got_symbols.mask); - // Round up to the nearest page: - size_t page_size = get_page_size(); - assert((page_size & (page_size - 1)) == 0); - size_t code_padding = DATA_ALIGN - ((code_size + state.trampolines.size) & (DATA_ALIGN - 1)); - size_t padding = page_size - ((code_size + state.trampolines.size + code_padding + data_size + state.got_symbols.size) & (page_size - 1)); - size_t total_size = code_size + state.trampolines.size + code_padding + data_size + state.got_symbols.size + padding; - unsigned char *memory = jit_alloc(total_size); - if (memory == NULL) { - return NULL; - } - unsigned char *code = memory; - state.trampolines.mem = memory + code_size; - unsigned char *data = memory + code_size + state.trampolines.size + code_padding; - state.got_symbols.mem = data + data_size; - // Compile the shim, which handles converting between the native - // calling convention and the calling convention used by jitted code - // (which may be different for efficiency reasons). 
- group = &shim; - group->emit(code, data, &dummy, NULL, &state); - code += group->code_size; - data += group->data_size; - assert(code == memory + code_size); - assert(data == memory + code_size + state.trampolines.size + code_padding + data_size); - if (mark_executable(memory, total_size)) { - jit_free(memory, total_size); - return NULL; - } - _Py_jit_shim_size = total_size; - return (_PyJitEntryFuncPtr)memory; -} - -static PyMutex lazy_jit_mutex = { 0 }; - -_Py_CODEUNIT * -_Py_LazyJitShim( - _PyExecutorObject *executor, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate -) { - PyMutex_Lock(&lazy_jit_mutex); - if (_Py_jit_entry == _Py_LazyJitShim) { - _PyJitEntryFuncPtr shim = compile_shim(); - if (shim == NULL) { - PyMutex_Unlock(&lazy_jit_mutex); - Py_FatalError("Cannot allocate core JIT code"); - } - _Py_jit_entry = shim; - } - PyMutex_Unlock(&lazy_jit_mutex); - return _Py_jit_entry(executor, frame, stack_pointer, tstate); -} - // Free executor's memory allocated with _PyJIT_Compile void _PyJIT_Free(_PyExecutorObject *executor) @@ -812,22 +734,4 @@ _PyJIT_Free(_PyExecutorObject *executor) } } -// Free shim memory allocated with compile_shim -void -_PyJIT_Fini(void) -{ - PyMutex_Lock(&lazy_jit_mutex); - unsigned char *memory = (unsigned char *)_Py_jit_entry; - size_t size = _Py_jit_shim_size; - if (size) { - _Py_jit_entry = _Py_LazyJitShim; - _Py_jit_shim_size = 0; - if (jit_free(memory, size)) { - PyErr_FormatUnraisable("Exception ignored while " - "freeing JIT entry code"); - } - } - PyMutex_Unlock(&lazy_jit_mutex); -} - #endif // _Py_JIT diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 0232ed6c382c61..0a88e32bb6b65e 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -37,9 +37,6 @@ #include "pycore_uniqueid.h" // _PyObject_FinalizeUniqueIdPool() #include "pycore_warnings.h" // _PyWarnings_InitState() #include "pycore_weakref.h" // _PyWeakref_GET_REF() -#ifdef _Py_JIT -#include "pycore_jit.h" // 
_PyJIT_Fini() -#endif #if defined(PYMALLOC_USE_HUGEPAGES) && defined(MS_WINDOWS) #include @@ -2531,11 +2528,6 @@ _Py_Finalize(_PyRuntimeState *runtime) finalize_interp_clear(tstate); -#ifdef _Py_JIT - /* Free JIT shim memory */ - _PyJIT_Fini(); -#endif - #ifdef Py_TRACE_REFS /* Display addresses (& refcnts) of all objects still alive. * An address can be used to find the repr of the object, printed diff --git a/Python/pystate.c b/Python/pystate.c index d6a26f3339b863..b7c838a1c156ae 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -489,11 +489,6 @@ free_interpreter(PyInterpreterState *interp) static inline int check_interpreter_whence(long); #endif -extern _Py_CODEUNIT * -_Py_LazyJitShim( - struct _PyExecutorObject *exec, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate -); - /* Get the interpreter state to a minimal consistent state. Further init happens in pylifecycle.c before it can be used. All fields not initialized here are expected to be zeroed out, diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index f78e80db165fc8..15cac3de3fe11f 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -57,6 +57,12 @@ class _Target(typing.Generic[_S, _R]): known_symbols: dict[str, int] = dataclasses.field(default_factory=dict) pyconfig_dir: pathlib.Path = pathlib.Path.cwd().resolve() + def _compile_args(self) -> list[str]: + return list(self.args) + + def _shim_compile_args(self) -> list[str]: + return [] + def _get_nop(self) -> bytes: if re.fullmatch(r"aarch64-.*", self.triple): nop = b"\x1f\x20\x03\xd5" @@ -139,12 +145,8 @@ def _handle_relocation( ) -> _stencils.Hole: raise NotImplementedError(type(self)) - async def _compile( - self, opname: str, c: pathlib.Path, tempdir: pathlib.Path - ) -> _stencils.StencilGroup: - s = tempdir / f"{opname}.s" - o = tempdir / f"{opname}.o" - args_s = [ + def _base_clang_args(self, opname: str, tempdir: pathlib.Path) -> list[str]: + return [ f"--target={self.triple}", 
"-DPy_BUILD_CORE_MODULE", "-D_DEBUG" if self.debug else "-DNDEBUG", @@ -167,29 +169,38 @@ async def _compile( # generates better code than -O2 (and -O2 usually generates better # code than -O3). As a nice benefit, it uses less memory too: "-Os", - "-S", # Shorten full absolute file paths in the generated code (like the # __FILE__ macro and assert failure messages) for reproducibility: f"-ffile-prefix-map={CPYTHON}=.", f"-ffile-prefix-map={tempdir}=.", - # This debug info isn't necessary, and bloats out the JIT'ed code. - # We *may* be able to re-enable this, process it, and JIT it for a - # nicer debugging experience... but that needs a lot more research: - "-fno-asynchronous-unwind-tables", # Don't call built-in functions that we can't find or patch: "-fno-builtin", # Don't call stack-smashing canaries that we can't find or patch: "-fno-stack-protector", "-std=c11", + ] + + async def _build_stencil_group( + self, opname: str, c: pathlib.Path, tempdir: pathlib.Path + ) -> _stencils.StencilGroup: + s = tempdir / f"{opname}.s" + o = tempdir / f"{opname}.o" + args_s = self._base_clang_args(opname, tempdir) + args_s += [ + "-S", + # Stencils do not need unwind info, and the optimizer does not + # preserve .cfi_* directives correctly. On Darwin, + # -fno-asynchronous-unwind-tables alone still leaves synchronous + # unwind directives in the assembly, so disable both forms here. 
+ "-fno-unwind-tables", + "-fno-asynchronous-unwind-tables", "-o", f"{s}", f"{c}", ] - is_shim = opname == "shim" if self.frame_pointers: - frame_pointer = "all" if is_shim else "reserved" - args_s += ["-Xclang", f"-mframe-pointer={frame_pointer}"] - args_s += self.args + args_s += ["-Xclang", "-mframe-pointer=reserved"] + args_s += self._compile_args() # Allow user-provided CFLAGS to override any defaults args_s += shlex.split(self.cflags) await _llvm.run( @@ -199,14 +210,13 @@ async def _compile( llvm_version=self.llvm_version, llvm_tools_install_dir=self.llvm_tools_install_dir, ) - if not is_shim: - self.optimizer( - s, - label_prefix=self.label_prefix, - symbol_prefix=self.symbol_prefix, - re_global=self.re_global, - frame_pointers=self.frame_pointers, - ).run() + self.optimizer( + s, + label_prefix=self.label_prefix, + symbol_prefix=self.symbol_prefix, + re_global=self.re_global, + frame_pointers=self.frame_pointers, + ).run() args_o = [f"--target={self.triple}", "-c", "-o", f"{o}", f"{s}"] await _llvm.run( "clang", @@ -217,6 +227,30 @@ async def _compile( ) return await self._parse(o) + async def _build_shim_object(self, output: pathlib.Path) -> None: + with tempfile.TemporaryDirectory() as tempdir: + work = pathlib.Path(tempdir).resolve() + args_o = self._base_clang_args("shim", work) + args_o += self._shim_compile_args() + args_o += [ + "-c", + # The linked shim is a real function in the final binary, so + # keep unwind info for debuggers and stack walkers. 
+ "-fasynchronous-unwind-tables", + ] + if self.frame_pointers: + args_o += ["-Xclang", "-mframe-pointer=all"] + args_o += self._compile_args() + args_o += shlex.split(self.cflags) + args_o += ["-o", f"{output}", f"{TOOLS_JIT / 'shim.c'}"] + await _llvm.run( + "clang", + args_o, + echo=self.verbose, + llvm_version=self.llvm_version, + llvm_tools_install_dir=self.llvm_tools_install_dir, + ) + async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: generated_cases = PYTHON_EXECUTOR_CASES_C_H.read_text() cases_and_opnames = sorted( @@ -231,8 +265,6 @@ async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: with tempfile.TemporaryDirectory() as tempdir: work = pathlib.Path(tempdir).resolve() async with asyncio.TaskGroup() as group: - coro = self._compile("shim", TOOLS_JIT / "shim.c", work) - tasks.append(group.create_task(coro, name="shim")) template = TOOLS_JIT_TEMPLATE_C.read_text() for case, opname in cases_and_opnames: # Write out a copy of the template with *only* this case @@ -242,7 +274,7 @@ async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: # all of the other cases): c = work / f"{opname}.c" c.write_text(template.replace("CASE", case)) - coro = self._compile(opname, c, work) + coro = self._build_stencil_group(opname, c, work) tasks.append(group.create_task(coro, name=opname)) stencil_groups = {task.get_name(): task.result() for task in tasks} for stencil_group in stencil_groups.values(): @@ -256,8 +288,9 @@ def build( comment: str = "", force: bool = False, jit_stencils: pathlib.Path, + jit_shim_object: pathlib.Path, ) -> None: - """Build jit_stencils.h in the given directory.""" + """Build jit_stencils.h and the shim object in the given directory.""" jit_stencils.parent.mkdir(parents=True, exist_ok=True) if not self.stable: warning = f"JIT support for {self.triple} is still experimental!" 
@@ -271,8 +304,10 @@ def build( not force and jit_stencils.exists() and jit_stencils.read_text().startswith(digest) + and jit_shim_object.exists() ): return + ASYNCIO_RUNNER.run(self._build_shim_object(jit_shim_object)) stencil_groups = ASYNCIO_RUNNER.run(self._build_stencils()) jit_stencils_new = jit_stencils.parent / "jit_stencils.h.new" try: @@ -296,6 +331,13 @@ def build( class _COFF( _Target[_schema.COFFSection, _schema.COFFRelocation] ): # pylint: disable = too-few-public-methods + def _shim_compile_args(self) -> list[str]: + # The linked shim is part of pythoncore, not a shared extension. + # On Windows, Py_BUILD_CORE_MODULE makes public APIs import from + # pythonXY.lib, which creates a self-dependency when linking + # pythoncore.dll. Build the shim with builtin/core semantics. + return ["-UPy_BUILD_CORE_MODULE", "-DPy_BUILD_CORE_BUILTIN"] + def _handle_section( self, section: _schema.COFFSection, group: _stencils.StencilGroup ) -> None: @@ -396,6 +438,10 @@ class _COFF64(_COFF): symbol_prefix = "" re_global = re.compile(r'\s*\.def\s+(?P