Skip to content

Commit 494f2e3

Browse files
authored
gh-150723: Fix perf jitdump files on macOS (#150728)
The perf jitdump format defines the thread id field of the JR_CODE_LOAD record as a 32-bit value, but on macOS it was declared as a uint64_t (since pthread_threadid_np() yields a uint64_t). The wider field, together with the alignment padding it forces, shifted every following field by 8 bytes, so parsers reading the file by the spec misread code_size as the code address and failed to resolve any Python frames. Declare thread_id as uint32_t on all platforms and truncate the macOS thread id when writing the record. The value is only informational: symbols are resolved by address, not by thread id, so truncation is safe here.

* Use mach_absolute_time for macOS jitdump timestamps

On macOS the jitdump file is consumed by profilers such as samply, which timestamp their samples using mach_absolute_time(). The jitdump events were stamped with clock_gettime(CLOCK_MONOTONIC), a different clock domain that keeps advancing while the system is asleep, so the JIT code mappings could be off by days relative to the samples and no Python frame would resolve. Stamp jitdump events with mach_absolute_time() on macOS so they share the sampler's clock domain. Linux continues to use CLOCK_MONOTONIC to stay aligned with perf.

Add a test that runs the -Xperf_jit (jitdump) backend through samply and asserts that Python frames resolve, covering the binary jitdump path end to end. The test is skipped when samply is not installed.
1 parent 29805f0 commit 494f2e3

4 files changed

Lines changed: 56 additions & 5 deletions

File tree

Lib/test/test_samply_profiler.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,5 +240,29 @@ def compile_trampolines_for_all_functions():
240240
self.assertIn(line, child_perf_file_contents)
241241

242242

243+
@unittest.skipUnless(samply_command_works(), "samply command doesn't work")
244+
class TestSamplyProfilerWithJitDump(unittest.TestCase, TestSamplyProfilerMixin):
245+
# Regression test for gh-150723: exercises the binary jitdump backend
246+
# (-Xperf_jit) end to end through samply, unlike TestSamplyProfiler which
247+
# uses the textual perf-map backend (-Xperf).
248+
def run_samply(self, script_dir, script, activate_trampoline=True):
249+
if activate_trampoline:
250+
return run_samply(script_dir, sys.executable, "-Xperf_jit", script)
251+
return run_samply(script_dir, sys.executable, script)
252+
253+
def setUp(self):
254+
super().setUp()
255+
self.jit_files = set(pathlib.Path("/tmp/").glob("jit-*.dump"))
256+
self.jit_files |= set(pathlib.Path("/tmp/").glob("jitted-*.so"))
257+
258+
def tearDown(self) -> None:
259+
super().tearDown()
260+
files_to_delete = set(pathlib.Path("/tmp/").glob("jit-*.dump"))
261+
files_to_delete |= set(pathlib.Path("/tmp/").glob("jitted-*.so"))
262+
files_to_delete -= self.jit_files
263+
for file in files_to_delete:
264+
file.unlink()
265+
266+
243267
if __name__ == "__main__":
244268
unittest.main()
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix malformed perf jitdump thread ids on macOS. The ``thread_id`` field of the
2+
``JR_CODE_LOAD`` record was written as a 64-bit value instead of the 32-bit
3+
value required by the jitdump format, which shifted every following field and
4+
prevented profilers from resolving Python frames.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix perf jitdump timestamps on macOS. Events were stamped using
2+
``CLOCK_MONOTONIC``, but macOS profilers timestamp their samples with
3+
``mach_absolute_time()``. The mismatch prevented the JIT code mappings from
4+
lining up with the samples, so no Python frame could be resolved.

Python/perf_jit_trampoline.c

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,9 @@
8282
#if defined(__linux__)
8383
# include <sys/syscall.h> // System call interface
8484
#endif
85+
#if defined(__APPLE__)
86+
# include <mach/mach_time.h> // mach_absolute_time, mach_timebase_info
87+
#endif
8588

8689
// =============================================================================
8790
// CONSTANTS AND CONFIGURATION
@@ -217,11 +220,7 @@ struct BaseEvent {
217220
typedef struct {
218221
struct BaseEvent base; // Common event header
219222
uint32_t process_id; // Process ID where code was generated
220-
#if defined(__APPLE__)
221-
uint64_t thread_id; // Thread ID where code was generated
222-
#else
223223
uint32_t thread_id; // Thread ID where code was generated
224-
#endif
225224
uint64_t vma; // Virtual memory address where code is loaded
226225
uint64_t code_address; // Address of the actual machine code
227226
uint64_t code_size; // Size of the machine code in bytes
@@ -295,7 +294,9 @@ static PerfMapJitState perf_jit_map_state;
295294
// =============================================================================
296295

297296
/* Time conversion constant */
297+
#if !defined(__APPLE__)
298298
static const intptr_t nanoseconds_per_second = 1000000000;
299+
#endif
299300

300301
/*
301302
* Get current monotonic time in nanoseconds
@@ -307,6 +308,18 @@ static const intptr_t nanoseconds_per_second = 1000000000;
307308
* Returns: Current monotonic time in nanoseconds since an arbitrary epoch
308309
*/
309310
static int64_t get_current_monotonic_ticks(void) {
311+
#if defined(__APPLE__)
312+
// On macOS the jitdump file is consumed by profilers (such as samply) that
313+
// timestamp their samples using mach_absolute_time(). The jitdump event
314+
// timestamps must use the same clock domain, otherwise the JIT code
315+
// mappings cannot be lined up with the samples.
316+
static mach_timebase_info_data_t timebase = {0, 0};
317+
if (timebase.denom == 0) {
318+
(void)mach_timebase_info(&timebase);
319+
}
320+
uint64_t ticks = mach_absolute_time();
321+
return (int64_t)(ticks * timebase.numer / timebase.denom);
322+
#else
310323
struct timespec ts;
311324
if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
312325
Py_UNREACHABLE(); // Should never fail on supported systems
@@ -318,6 +331,7 @@ static int64_t get_current_monotonic_ticks(void) {
318331
result *= nanoseconds_per_second;
319332
result += ts.tv_nsec;
320333
return result;
334+
#endif
321335
}
322336

323337
/*
@@ -652,7 +666,12 @@ static void perf_map_jit_write_entry_with_name(
652666
ev.base.time_stamp = get_current_monotonic_ticks();
653667
ev.process_id = getpid();
654668
#if defined(__APPLE__)
655-
pthread_threadid_np(NULL, &ev.thread_id);
669+
// The jitdump format defines the thread id field as a 32-bit value, but
670+
// pthread_threadid_np() returns a 64-bit id. Truncate it to 32 bits to
671+
// keep the record layout identical to other platforms.
672+
uint64_t thread_id = 0;
673+
pthread_threadid_np(NULL, &thread_id);
674+
ev.thread_id = (uint32_t)thread_id;
656675
#else
657676
ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call
658677
#endif

0 commit comments

Comments
 (0)