Skip to content

Commit 8a872ae

Browse files
committed
Fix ARM64 stack corruption in ghost_stack by skipping internal frames
On ARM64, the first frame(s) returned by libunwind's cursor iteration could have invalid ret_loc values pointing into our own active stack frame. Writing the trampoline address to these locations corrupted our execution state, causing crashes. Root cause: On ARM64, unw_get_save_loc() for the link register (X30) returns the location where LR was saved by the function prologue. For our internal frames (capture_and_install, ghost_stack_backtrace), these locations were still being used during execution. Additionally, fixed the expected_sp calculation for ARM64 longjmp detection. On x86_64, RET pops the return address so SP = ret_loc + 8. On ARM64, RET doesn't touch SP - the epilogue restores it beforehand. The trampoline receives the actual SP value, not ret_loc + 8.
1 parent 09b458b commit 8a872ae

File tree

2 files changed

+43
-48
lines changed

2 files changed

+43
-48
lines changed

src/memray/_memray/ghost_stack/src/ghost_stack.cpp

Lines changed: 42 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,14 @@ extern "C" void ghost_ret_trampoline();
5252
// Logging (minimal, stderr only)
5353
// ============================================================================
5454

55+
// #define DEBUG
5556
#ifdef DEBUG
56-
#define LOG_DEBUG(...) fprintf(stderr, "[GhostStack] " __VA_ARGS__)
57+
#define LOG_DEBUG(...) do { fprintf(stderr, "[GS] " __VA_ARGS__); fflush(stderr); } while(0)
5758
#else
5859
#define LOG_DEBUG(...) ((void)0)
5960
#endif
6061

61-
#define LOG_ERROR(...) fprintf(stderr, "[GhostStack][ERROR] " __VA_ARGS__)
62+
#define LOG_ERROR(...) do { fprintf(stderr, "[GS][ERR] " __VA_ARGS__); fflush(stderr); } while(0)
6263

6364
// ============================================================================
6465
// Utilities
@@ -179,17 +180,6 @@ class GhostStackImpl {
179180

180181
/**
181182
* Called by trampoline when a function returns.
182-
*
183-
* Uses epoch-based validation to detect if reset() was called during
184-
* execution (e.g., from a signal handler). This prevents accessing
185-
* stale or cleared entries.
186-
*
187-
* Implements longjmp detection by comparing the current stack pointer
188-
* against the expected value. If they don't match, searches backward
189-
* through the shadow stack to find the matching entry (like nwind does).
190-
*
191-
* @param sp Stack pointer at return time (for longjmp detection)
192-
* @return Original return address to jump to
193183
*/
194184
uintptr_t on_ret_trampoline(uintptr_t sp) {
195185
// Capture current epoch - if it changes, reset() was called
@@ -199,7 +189,8 @@ class GhostStackImpl {
199189
size_t tail = tail_.fetch_sub(1, std::memory_order_acq_rel) - 1;
200190

201191
if (entries_.empty() || tail >= entries_.size()) {
202-
LOG_ERROR("Stack corruption in trampoline!\n");
192+
LOG_ERROR("CORRUPTION! empty=%d tail=%zu sz=%zu\n",
193+
(int)entries_.empty(), tail, entries_.size());
203194
std::abort();
204195
}
205196

@@ -208,17 +199,10 @@ class GhostStackImpl {
208199
// Check for longjmp: if SP doesn't match expected, search backward
209200
// through shadow stack for matching entry (frames were skipped)
210201
if (sp != 0 && entry.stack_pointer != 0 && entry.stack_pointer != sp) {
211-
LOG_DEBUG("SP mismatch at index %zu: expected 0x%lx, got 0x%lx - checking for longjmp\n",
212-
tail, entry.stack_pointer, sp);
213-
214202
// Search backward through shadow stack for matching SP (nwind style)
215203
// Only update tail_ if we find a match - don't corrupt it during search
216204
for (size_t i = tail; i > 0; --i) {
217205
if (entries_[i - 1].stack_pointer == sp) {
218-
size_t skipped = tail - (i - 1);
219-
LOG_DEBUG("longjmp detected: found matching SP at index %zu (skipped %zu frames)\n",
220-
i - 1, skipped);
221-
222206
// Update tail_ to skip all the frames that were bypassed by longjmp
223207
tail_.store(i - 1, std::memory_order_release);
224208
tail = i - 1;
@@ -234,8 +218,7 @@ class GhostStackImpl {
234218
std::abort();
235219
}
236220

237-
uintptr_t ret_addr = entries_[tail].return_address;
238-
return ret_addr;
221+
return entries_[tail].return_address;
239222
}
240223

241224
private:
@@ -254,7 +237,6 @@ class GhostStackImpl {
254237
buffer[i] = reinterpret_cast<void*>(entries_[count - 1 - i].ip);
255238
}
256239

257-
LOG_DEBUG("Fast path: %zu frames\n", count);
258240
return count;
259241
}
260242

@@ -278,9 +260,12 @@ class GhostStackImpl {
278260
unw_getcontext(&ctx);
279261
unw_init_local(&cursor, &ctx);
280262

263+
// Skip the current frame to avoid patching our own return address
264+
if (unw_step(&cursor) > 0) {
265+
// Skipped internal frame
266+
}
267+
281268
// Process frames: read current frame, then step to next
282-
// Note: After skip loop, cursor is positioned AT the first frame we want
283-
// We need to read first, then step (not step-then-read)
284269
size_t frame_idx = 0;
285270
int step_result;
286271
do {
@@ -309,17 +294,31 @@ class GhostStackImpl {
309294

310295
// Get location where return address is stored
311296
uintptr_t* ret_loc = nullptr;
297+
298+
// Get actual SP (needed for ARM64 expected_sp calculation)
299+
unw_word_t actual_sp;
300+
unw_get_reg(&cursor, UNW_REG_SP, &actual_sp);
301+
312302
#ifdef __linux__
313303
unw_save_loc_t loc;
314-
if (unw_get_save_loc(&cursor, GS_RA_REGISTER, &loc) == 0 &&
315-
loc.type == UNW_SLT_MEMORY) {
304+
int save_loc_ret = unw_get_save_loc(&cursor, GS_RA_REGISTER, &loc);
305+
306+
if (save_loc_ret == 0 && loc.type == UNW_SLT_MEMORY && loc.u.addr != 0) {
316307
ret_loc = reinterpret_cast<uintptr_t*>(loc.u.addr);
308+
// Sanity check: ret_loc should be somewhere near FP (which is our sp variable)
309+
uintptr_t addr = loc.u.addr;
310+
if (addr < sp - 0x10000 || addr > sp + 0x10000) {
311+
ret_loc = nullptr; // Don't use this suspicious address
312+
}
317313
}
318314
#else
319315
// macOS: return address is at fp + sizeof(void*)
320316
ret_loc = reinterpret_cast<uintptr_t*>(sp + sizeof(void*));
321317
#endif
322-
if (!ret_loc) break;
318+
319+
if (!ret_loc) {
320+
break;
321+
}
323322

324323
uintptr_t ret_addr = *ret_loc;
325324

@@ -332,16 +331,22 @@ class GhostStackImpl {
332331
// Compare against stripped address since trampoline address doesn't have PAC
333332
if (stripped_ret_addr == reinterpret_cast<uintptr_t>(ghost_ret_trampoline)) {
334333
found_existing = true;
335-
LOG_DEBUG("Found existing trampoline at frame %zu\n", frame_idx);
336334
break;
337335
}
338336

339337
// Store the stack pointer that the trampoline will pass.
340-
// The trampoline passes RSP right after landing (before its stack manipulations).
341-
// When RET executes, it pops the return address, so:
342-
// RSP_trampoline = ret_loc + sizeof(void*)
343338
// This allows longjmp detection by comparing against the stored value.
339+
//
340+
// On x86_64: RET pops return address, so trampoline sees ret_loc + 8
341+
// On ARM64: RET doesn't touch SP. The trampoline receives the actual SP
342+
// at the moment of return (after the function's epilogue ran).
343+
// This is the value from UNW_REG_SP, not the FP (UNW_AARCH64_X29).
344+
#ifdef GS_ARCH_AARCH64
345+
uintptr_t expected_sp = actual_sp; // Actual SP at this frame
346+
#else
344347
uintptr_t expected_sp = reinterpret_cast<uintptr_t>(ret_loc) + sizeof(void*);
348+
#endif
349+
345350
// Store both IP (for returning to caller) and return_address (for trampoline restoration)
346351
// Insert at beginning to reverse order (oldest at index 0, newest at end)
347352
new_entries.insert(new_entries.begin(), {ip, ret_addr, ret_loc, expected_sp});
@@ -351,8 +356,10 @@ class GhostStackImpl {
351356
} while (step_result > 0);
352357

353358
// Install trampolines on new entries
354-
for (auto& e : new_entries) {
355-
*e.location = reinterpret_cast<uintptr_t>(ghost_ret_trampoline);
359+
uintptr_t tramp_addr = reinterpret_cast<uintptr_t>(ghost_ret_trampoline);
360+
for (size_t i = 0; i < new_entries.size(); ++i) {
361+
auto& e = new_entries[i];
362+
*e.location = tramp_addr;
356363
}
357364

358365
// Merge with existing entries if we found a patched frame
@@ -375,7 +382,6 @@ class GhostStackImpl {
375382
buffer[i] = reinterpret_cast<void*>(entries_[count - 1 - i].ip);
376383
}
377384

378-
LOG_DEBUG("Captured %zu frames\n", count);
379385
return count;
380386
}
381387

@@ -431,7 +437,6 @@ struct ThreadLocalInstance {
431437

432438
~ThreadLocalInstance() {
433439
if (ptr) {
434-
LOG_DEBUG("Thread exit: resetting shadow stack\n");
435440
ptr->reset();
436441
delete ptr;
437442
ptr = nullptr;
@@ -444,7 +449,6 @@ static thread_local ThreadLocalInstance t_instance;
444449
static GhostStackImpl& get_instance() {
445450
if (!t_instance.ptr) {
446451
t_instance.ptr = new GhostStackImpl();
447-
LOG_DEBUG("Created new shadow stack instance for thread\n");
448452
}
449453
return *t_instance.ptr;
450454
}
@@ -473,13 +477,11 @@ static void fork_child_handler() {
473477
if (t_instance.ptr) {
474478
t_instance.ptr->reset();
475479
}
476-
LOG_DEBUG("Fork child handler: reset shadow stack\n");
477480
}
478481

479482
static void register_atfork_handler() {
480483
std::call_once(g_atfork_flag, []() {
481484
pthread_atfork(nullptr, nullptr, fork_child_handler);
482-
LOG_DEBUG("Registered pthread_atfork handler\n");
483485
});
484486
}
485487

@@ -492,8 +494,6 @@ extern "C" {
492494
void ghost_stack_init(ghost_stack_unwinder_t unwinder) {
493495
std::call_once(g_init_flag, [unwinder]() {
494496
g_custom_unwinder = unwinder;
495-
LOG_DEBUG("Initialized with %s unwinder\n",
496-
unwinder ? "custom" : "default");
497497
});
498498

499499
// Register fork handler (idempotent, safe to call multiple times)
@@ -542,8 +542,6 @@ uintptr_t ghost_trampoline_handler(uintptr_t sp) {
542542

543543
// Called when exception passes through trampoline
544544
uintptr_t ghost_exception_handler(void* exception) {
545-
LOG_DEBUG("Exception through trampoline\n");
546-
547545
auto& impl = get_instance();
548546
uintptr_t ret = impl.pop_entry(); // Direct pop, no longjmp check
549547
impl.reset();

src/memray/_memray/tracking_api.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,9 @@
2727

2828
#ifdef MEMRAY_HAS_GHOST_STACK
2929
# include "ghost_stack.h"
30-
#if defined(__linux__)
31-
# define GHOST_STACK_SKIP_FRAMES 2
32-
#elif defined(__APPLE__)
30+
// ghost_stack skips 1 internal frame, we skip 1 more for our tracking frame
3331
# define GHOST_STACK_SKIP_FRAMES 1
3432
#endif
35-
#endif
3633

3734
#include "frame_tree.h"
3835
#include "hooks.h"

0 commit comments

Comments
 (0)