Files
Pablo Galindo Salgado 65fbec64f6 [3.15] gh-151613: Fix remote debugging frame cache ABA (#152448)
gh-151613: Fix remote debugging frame cache ABA (#151614)

The remote debugging frame cache previously used only the last_profiled_frame address as its cache anchor. If a frame returned and a later frame reused the same _PyInterpreterFrame address, the profiler could accept a stale cache entry and splice parent frames from a different call chain into the current stack.

This adds a last_profiled_frame_seq counter next to last_profiled_frame, increments it when the anchor advances, stores it in frame cache entries, and validates cache hits against both the frame address and the sequence. Cache miss walks now copy stack chunks before storing new cache entries so stored continuations come from a stable snapshot. The new regression test exercises alternating call chains and checks that cached stacks never contain frames from both branches.

(cherry picked from commit 8cda6ae2f1)
2026-06-27 17:42:31 +00:00

367 lines
12 KiB
C

/******************************************************************************
* Remote Debugging Module - Frame Cache
*
* This file contains functions for caching frame information to optimize
* repeated stack unwinding for profiling.
******************************************************************************/
#include "_remote_debugging.h"
/* ============================================================================
* FRAME CACHE - stores (address, frame_info) pairs per thread
* Uses preallocated fixed-size arrays for efficiency and bounded memory.
* ============================================================================ */
int
frame_cache_init(RemoteUnwinderObject *unwinder)
{
unwinder->frame_cache = PyMem_Calloc(FRAME_CACHE_MAX_THREADS, sizeof(FrameCacheEntry));
if (!unwinder->frame_cache) {
PyErr_NoMemory();
return -1;
}
return 0;
}
void
frame_cache_cleanup(RemoteUnwinderObject *unwinder)
{
if (!unwinder->frame_cache) {
return;
}
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
Py_CLEAR(unwinder->frame_cache[i].frame_list);
}
PyMem_Free(unwinder->frame_cache);
unwinder->frame_cache = NULL;
}
// Find cache entry by thread_id
FrameCacheEntry *
frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
{
if (!unwinder->frame_cache || thread_id == 0) {
return NULL;
}
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
assert(i >= 0 && i < FRAME_CACHE_MAX_THREADS);
if (unwinder->frame_cache[i].thread_id == thread_id) {
assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES);
return &unwinder->frame_cache[i];
}
}
return NULL;
}
FrameCacheEntry *
frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr)
{
if (!unwinder->frame_cache || tstate_addr == 0) {
return NULL;
}
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
if (unwinder->frame_cache[i].thread_state_addr == tstate_addr) {
assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES);
return &unwinder->frame_cache[i];
}
}
return NULL;
}
// Allocate a cache slot for a thread
// Returns NULL if cache is full (graceful degradation)
static FrameCacheEntry *
frame_cache_alloc_slot(RemoteUnwinderObject *unwinder, uint64_t thread_id)
{
if (!unwinder->frame_cache || thread_id == 0) {
return NULL;
}
// First check if thread already has an entry
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
if (unwinder->frame_cache[i].thread_id == thread_id) {
return &unwinder->frame_cache[i];
}
}
// Find empty slot
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
if (unwinder->frame_cache[i].thread_id == 0) {
return &unwinder->frame_cache[i];
}
}
// Cache full - graceful degradation
return NULL;
}
// Remove cache entries for threads not seen in the result
// result structure: list of InterpreterInfo, where InterpreterInfo[1] is threads list,
// and ThreadInfo[0] is the thread_id
void
frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
{
if (!unwinder->frame_cache || !result || !PyList_Check(result)) {
return;
}
// Build array of seen thread IDs from result
uint64_t seen_threads[FRAME_CACHE_MAX_THREADS];
int num_seen = 0;
Py_ssize_t num_interps = PyList_GET_SIZE(result);
for (Py_ssize_t i = 0; i < num_interps && num_seen < FRAME_CACHE_MAX_THREADS; i++) {
PyObject *interp_info = PyList_GET_ITEM(result, i);
PyObject *threads = PyStructSequence_GetItem(interp_info, 1);
if (!threads || !PyList_Check(threads)) {
continue;
}
Py_ssize_t num_threads = PyList_GET_SIZE(threads);
for (Py_ssize_t j = 0; j < num_threads && num_seen < FRAME_CACHE_MAX_THREADS; j++) {
PyObject *thread_info = PyList_GET_ITEM(threads, j);
PyObject *tid_obj = PyStructSequence_GetItem(thread_info, 0);
if (tid_obj) {
uint64_t tid = PyLong_AsUnsignedLongLong(tid_obj);
if (!PyErr_Occurred()) {
seen_threads[num_seen++] = tid;
} else {
PyErr_Clear();
}
}
}
}
// Invalidate entries not in seen list
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
if (unwinder->frame_cache[i].thread_id == 0) {
continue;
}
int found = 0;
for (int j = 0; j < num_seen; j++) {
if (unwinder->frame_cache[i].thread_id == seen_threads[j]) {
found = 1;
break;
}
}
if (!found) {
// Clear this entry
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
Py_CLEAR(unwinder->frame_cache[i].frame_list);
unwinder->frame_cache[i].thread_id = 0;
unwinder->frame_cache[i].thread_state_addr = 0;
unwinder->frame_cache[i].last_profiled_frame_seq = 0;
unwinder->frame_cache[i].num_addrs = 0;
STATS_INC(unwinder, stale_cache_invalidations);
}
}
}
static int
read_last_profiled_anchor(RemoteUnwinderObject *unwinder,
uintptr_t thread_state_addr,
FrameCacheAnchor *anchor)
{
uintptr_t frame_offset = (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame;
uintptr_t seq_offset = (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame_seq;
// These fields are adjacent in PyThreadState. Read them together when the
// layout allows it so validation uses a pointer and sequence from the same
// remote-memory read.
if (seq_offset == frame_offset + sizeof(uintptr_t)) {
uintptr_t live_anchor[2];
if (_Py_RemoteDebug_ReadRemoteMemory(&unwinder->handle,
thread_state_addr + frame_offset,
sizeof(live_anchor),
live_anchor) < 0) {
return -1;
}
anchor->frame = live_anchor[0];
anchor->seq = live_anchor[1];
return 0;
}
if (read_ptr(unwinder, thread_state_addr + frame_offset, &anchor->frame) < 0) {
return -1;
}
return read_ptr(unwinder, thread_state_addr + seq_offset, &anchor->seq);
}
static Py_ssize_t
find_cached_frame_addr(const FrameCacheEntry *entry, uintptr_t frame_addr,
uintptr_t *real_pops)
{
*real_pops = 0;
for (Py_ssize_t i = 0; i < entry->num_addrs; i++) {
if (entry->addrs[i] == frame_addr) {
return i;
}
if (entry->addrs[i] != 0) {
(*real_pops)++;
}
}
return -1;
}
int
frame_cache_anchor_matches(
RemoteUnwinderObject *unwinder,
uintptr_t thread_state_addr,
FrameCacheAnchor anchor)
{
FrameCacheAnchor live_anchor = {0, 0};
if (read_last_profiled_anchor(unwinder, thread_state_addr, &live_anchor) < 0) {
PyErr_Clear();
return 0;
}
return live_anchor.frame == anchor.frame && live_anchor.seq == anchor.seq;
}
// Find last_profiled_frame in cache and extend frame_info with cached continuation
// If frame_addrs is provided (not NULL), also extends it with cached addresses
int
frame_cache_lookup_and_extend(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
uintptr_t thread_state_addr,
FrameCacheAnchor anchor,
PyObject *frame_info,
uintptr_t *frame_addrs,
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs)
{
if (!unwinder->frame_cache || anchor.frame == 0) {
return 0;
}
FrameCacheEntry *entry = frame_cache_find(unwinder, thread_id);
if (!entry || !entry->frame_list) {
return 0;
}
if (entry->thread_state_addr != thread_state_addr) {
return 0;
}
assert(entry->num_addrs >= 0 && entry->num_addrs <= FRAME_CACHE_MAX_FRAMES);
uintptr_t real_pops = 0;
Py_ssize_t start_idx = find_cached_frame_addr(entry, anchor.frame, &real_pops);
if (start_idx < 0) {
return 0; // Not found
}
assert(start_idx < entry->num_addrs);
// Synthetic marker frames (<native>/<GC>) are stored as addr-0 entries but
// never increment last_profiled_frame_seq in the target (only real frame
// pops do). Count the real frames before start_idx so the sequence check is
// not thrown off by markers sitting between the leaf and the anchor.
if (entry->last_profiled_frame_seq + real_pops != anchor.seq) {
return 0;
}
Py_ssize_t num_frames = PyList_GET_SIZE(entry->frame_list);
if (start_idx >= num_frames) {
return 0;
}
// Extend frame_info with frames ABOVE start_idx (not including it).
// The frame at start_idx (last_profiled_frame) was the executing frame
// in the previous sample and its line number may have changed.
// Only frames above it (its callers) are frozen at their call sites.
Py_ssize_t cache_start = start_idx + 1;
if (cache_start >= num_frames) {
return 0; // Nothing above last_profiled_frame to extend with
}
if (!frame_cache_anchor_matches(unwinder, thread_state_addr, anchor)) {
return 0;
}
PyObject *slice = PyList_GetSlice(entry->frame_list, cache_start, num_frames);
if (!slice) {
return -1;
}
Py_ssize_t cur_size = PyList_GET_SIZE(frame_info);
int result = PyList_SetSlice(frame_info, cur_size, cur_size, slice);
Py_DECREF(slice);
if (result < 0) {
return -1;
}
// Also extend frame_addrs with cached addresses (above last_profiled_frame)
if (frame_addrs) {
for (Py_ssize_t i = cache_start; i < entry->num_addrs && *num_addrs < max_addrs; i++) {
frame_addrs[(*num_addrs)++] = entry->addrs[i];
}
}
return 1;
}
// Store frame list with addresses in cache
// Only stores complete stacks that reach base_frame_addr (validation done internally)
// Returns: 1 = stored successfully, 0 = not stored (graceful degradation), -1 = error
int
frame_cache_store(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs,
uintptr_t thread_state_addr,
uintptr_t last_profiled_frame_seq,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited)
{
if (!unwinder->frame_cache || thread_id == 0) {
return 0;
}
// Validate we have a complete stack before caching.
// Only cache if last_frame_visited matches base_frame_addr (the sentinel
// at the bottom of the stack). Note: we use last_frame_visited rather than
// addrs[num_addrs-1] because the base frame is visited but not added to the
// addrs array (it returns frame==NULL from is_frame_valid due to
// owner==FRAME_OWNED_BY_INTERPRETER).
if (base_frame_addr != 0 && last_frame_visited != base_frame_addr) {
// Incomplete stack - don't cache (graceful degradation)
return 0;
}
// Clamp to max frames
if (num_addrs > FRAME_CACHE_MAX_FRAMES) {
num_addrs = FRAME_CACHE_MAX_FRAMES;
}
assert(num_addrs >= 0 && num_addrs <= FRAME_CACHE_MAX_FRAMES);
FrameCacheEntry *entry = frame_cache_alloc_slot(unwinder, thread_id);
if (!entry) {
// Cache full - graceful degradation
return 0;
}
// Clear old frame_list if replacing
Py_CLEAR(entry->frame_list);
// Store full frame list (don't truncate to num_addrs - frames beyond the
// address array limit are still valid and needed for full cache hits)
Py_ssize_t num_frames = PyList_GET_SIZE(frame_list);
entry->frame_list = PyList_GetSlice(frame_list, 0, num_frames);
if (!entry->frame_list) {
return -1;
}
entry->thread_id = thread_id;
entry->thread_state_addr = thread_state_addr;
entry->last_profiled_frame_seq = last_profiled_frame_seq;
if (entry->thread_id_obj == NULL) {
entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
if (entry->thread_id_obj == NULL) {
return -1;
}
}
memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t));
entry->num_addrs = num_addrs;
assert(entry->num_addrs == num_addrs);
assert(entry->thread_id == thread_id);
return 1;
}