Skip to content

Commit 5510d0f

Browse files
committed
Update ghost unwind
1 parent 1588be9 commit 5510d0f

File tree

1 file changed

+192
-30
lines changed

1 file changed

+192
-30
lines changed

src/memray/_memray/ghost_stack/src/ghost_stack.cpp

Lines changed: 192 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,15 @@ extern "C" void ghost_ret_trampoline();
5252
// Logging (minimal, stderr only)
5353
// ============================================================================
5454

55-
#ifdef DEBUG
56-
#define LOG_DEBUG(...) fprintf(stderr, "[GhostStack] " __VA_ARGS__)
55+
// GS_FORCE_DEBUG can be defined via compiler flag (-DGS_FORCE_DEBUG) for test builds
56+
#if defined(DEBUG) || defined(GS_FORCE_DEBUG)
57+
#define LOG_DEBUG(...) do { fprintf(stderr, "[GhostStack][DEBUG] " __VA_ARGS__); fflush(stderr); } while(0)
5758
#else
5859
#define LOG_DEBUG(...) ((void)0)
5960
#endif
6061

61-
#define LOG_ERROR(...) fprintf(stderr, "[GhostStack][ERROR] " __VA_ARGS__)
62+
#define LOG_ERROR(...) do { fprintf(stderr, "[GhostStack][ERROR] " __VA_ARGS__); fflush(stderr); } while(0)
63+
#define LOG_INFO(...) do { fprintf(stderr, "[GhostStack][INFO] " __VA_ARGS__); fflush(stderr); } while(0)
6264

6365
// ============================================================================
6466
// Utilities
@@ -110,7 +112,14 @@ class GhostStackImpl {
110112

111113
// Main capture function - returns number of frames
112114
size_t backtrace(void** buffer, size_t max_frames) {
115+
LOG_DEBUG("=== backtrace ENTER ===\n");
116+
LOG_DEBUG(" this=%p, buffer=%p, max_frames=%zu\n", (void*)this, (void*)buffer, max_frames);
117+
LOG_DEBUG(" is_capturing_=%d, trampolines_installed_=%d, entries_.size()=%zu, tail_=%zu\n",
118+
(int)is_capturing_, (int)trampolines_installed_, entries_.size(),
119+
tail_.load(std::memory_order_acquire));
120+
113121
if (is_capturing_) {
122+
LOG_DEBUG(" Recursive call detected, returning 0\n");
114123
return 0; // Recursive call, bail out
115124
}
116125
is_capturing_ = true;
@@ -119,32 +128,80 @@ class GhostStackImpl {
119128

120129
// Fast path: trampolines installed, return cached frames
121130
if (trampolines_installed_ && !entries_.empty()) {
131+
LOG_DEBUG(" Taking FAST PATH (cached frames)\n");
122132
result = copy_cached_frames(buffer, max_frames);
123133
is_capturing_ = false;
134+
LOG_DEBUG("=== backtrace EXIT (fast path) result=%zu ===\n", result);
124135
return result;
125136
}
126137

127138
// Slow path: capture with unwinder and install trampolines
139+
LOG_DEBUG(" Taking SLOW PATH (capture and install)\n");
140+
141+
// Clear any stale entries from a previous reset before starting fresh capture
142+
if (!entries_.empty() && !trampolines_installed_) {
143+
LOG_DEBUG(" Clearing %zu stale entries from previous reset\n", entries_.size());
144+
entries_.clear();
145+
tail_.store(0, std::memory_order_release);
146+
}
147+
128148
result = capture_and_install(buffer, max_frames);
129149
is_capturing_ = false;
150+
LOG_DEBUG("=== backtrace EXIT (slow path) result=%zu ===\n", result);
130151
return result;
131152
}
132153

133154
/**
134155
* Reset the shadow stack: restore the original return address at every entry location that still holds the trampoline, keeping entries_ and tail_ intact.
135156
*
136-
* This is the normal reset path - it restores the original return addresses
137-
* to the stack before clearing the shadow stack entries.
157+
* On ARM64, stale trampolines may still fire after reset() because the LR
158+
* register may have already been loaded with the trampoline address before
159+
* we restored the stack location. We keep entries_ around to handle these
160+
* stale trampolines gracefully.
161+
*
162+
* We restore ALL entries (not just 0 to tail-1) but only if the location
163+
* still contains the trampoline address. This handles the case where a
164+
* location was reused by a new frame after its original trampoline fired.
138165
*/
139166
void reset() {
167+
LOG_DEBUG("=== reset ENTER ===\n");
168+
LOG_DEBUG(" this=%p, trampolines_installed_=%d, entries_.size()=%zu, tail_=%zu\n",
169+
(void*)this, (int)trampolines_installed_, entries_.size(),
170+
tail_.load(std::memory_order_acquire));
171+
140172
if (trampolines_installed_) {
141-
size_t tail = tail_.load(std::memory_order_acquire);
142-
// With reversed order, iterate from 0 to tail (all entries below tail)
143-
for (size_t i = 0; i < tail; ++i) {
144-
*entries_[i].location = entries_[i].return_address;
173+
uintptr_t tramp_addr = reinterpret_cast<uintptr_t>(ghost_ret_trampoline);
174+
LOG_DEBUG(" Restoring locations that still have trampoline (0x%lx)\n", (unsigned long)tramp_addr);
175+
176+
// Restore ALL entries whose locations still contain the trampoline.
177+
// This handles both pending entries AND already-fired entries whose
178+
// locations haven't been reused by new frames.
179+
for (size_t i = 0; i < entries_.size(); ++i) {
180+
uintptr_t current_value = *entries_[i].location;
181+
// Strip PAC bits before comparison - on ARM64 with PAC enabled,
182+
// the value read from stack may be PAC-signed while tramp_addr is not
183+
uintptr_t stripped_value = ptrauth_strip(current_value);
184+
if (stripped_value == tramp_addr) {
185+
LOG_DEBUG(" [%zu] location=%p, restoring 0x%lx\n",
186+
i, (void*)entries_[i].location, (unsigned long)entries_[i].return_address);
187+
*entries_[i].location = entries_[i].return_address;
188+
} else {
189+
LOG_DEBUG(" [%zu] location=%p, skipping (current=0x%lx, not trampoline)\n",
190+
i, (void*)entries_[i].location, (unsigned long)current_value);
191+
}
145192
}
193+
194+
// Mark trampolines as not installed, but DON'T clear entries_!
195+
// On ARM64, stale trampolines may still fire because LR was loaded
196+
// before we restored the stack. Keep entries_ so we can still
197+
// return the correct address.
198+
trampolines_installed_ = false;
199+
200+
// Increment epoch to signal state change
201+
uint64_t new_epoch = epoch_.fetch_add(1, std::memory_order_release) + 1;
202+
LOG_DEBUG(" New epoch=%lu (entries preserved for stale trampolines)\n", (unsigned long)new_epoch);
146203
}
147-
clear_entries();
204+
LOG_DEBUG("=== reset EXIT ===\n");
148205
}
149206

150207
public:
@@ -153,12 +210,22 @@ class GhostStackImpl {
153210
* Decrements tail and returns the return address without longjmp checking.
154211
*/
155212
uintptr_t pop_entry() {
213+
LOG_DEBUG("=== pop_entry ENTER ===\n");
214+
LOG_DEBUG(" this=%p, entries_.size()=%zu, tail_=%zu\n",
215+
(void*)this, entries_.size(), tail_.load(std::memory_order_acquire));
216+
156217
size_t tail = tail_.fetch_sub(1, std::memory_order_acq_rel) - 1;
218+
LOG_DEBUG(" After fetch_sub: tail=%zu\n", tail);
219+
157220
if (tail >= entries_.size()) {
158221
LOG_ERROR("Stack corruption in pop_entry!\n");
222+
LOG_ERROR(" tail=%zu, entries_.size()=%zu\n", tail, entries_.size());
159223
std::abort();
160224
}
161-
return entries_[tail].return_address;
225+
uintptr_t ret = entries_[tail].return_address;
226+
LOG_DEBUG(" Returning address 0x%lx\n", (unsigned long)ret);
227+
LOG_DEBUG("=== pop_entry EXIT ===\n");
228+
return ret;
162229
}
163230

164231
private:
@@ -167,49 +234,114 @@ class GhostStackImpl {
167234
* Increments epoch to invalidate any in-flight trampoline operations.
168235
*/
169236
void clear_entries() {
237+
LOG_DEBUG("=== clear_entries ENTER ===\n");
238+
LOG_DEBUG(" this=%p, entries_.size()=%zu, tail_=%zu, epoch_=%lu\n",
239+
(void*)this, entries_.size(), tail_.load(std::memory_order_acquire),
240+
(unsigned long)epoch_.load(std::memory_order_acquire));
241+
170242
// Increment epoch FIRST to signal any in-flight operations
171-
epoch_.fetch_add(1, std::memory_order_release);
243+
uint64_t new_epoch = epoch_.fetch_add(1, std::memory_order_release) + 1;
244+
LOG_DEBUG(" New epoch=%lu\n", (unsigned long)new_epoch);
172245

173246
entries_.clear();
174247
tail_.store(0, std::memory_order_release);
175248
trampolines_installed_ = false;
249+
LOG_DEBUG("=== clear_entries EXIT ===\n");
176250
}
177251

178252
public:
179253

180254
/**
181255
* Called by trampoline when a function returns.
182256
*
183-
* Uses epoch-based validation to detect if reset() was called during
184-
* execution (e.g., from a signal handler). This prevents accessing
185-
* stale or cleared entries.
186-
*
187-
* Implements longjmp detection by comparing the current stack pointer
188-
* against the expected value. If they don't match, searches backward
189-
* through the shadow stack to find the matching entry (like nwind does).
257+
* Handles three scenarios:
258+
* 1. Normal operation: trampolines installed, decrement tail and return
259+
* 2. Post-reset stale trampoline (ARM64): return entries_[tail_ - 1] and step tail_ down by one (no SP search, no epoch check)
260+
* 3. Longjmp detection: SP mismatch, search backward for matching entry
190261
*
191-
* @param sp Stack pointer at return time (for longjmp detection)
262+
* @param sp Stack pointer at return time (compared against entry.stack_pointer for longjmp detection; the post-reset path does not use it)
192263
* @return Original return address to jump to
193264
*/
194265
uintptr_t on_ret_trampoline(uintptr_t sp) {
195-
// Capture current epoch - if it changes, reset() was called
266+
LOG_DEBUG("=== on_ret_trampoline ENTER ===\n");
267+
LOG_DEBUG(" this=%p, sp=0x%lx\n", (void*)this, (unsigned long)sp);
268+
269+
// Log state
270+
size_t tail_before = tail_.load(std::memory_order_acquire);
271+
size_t entries_size = entries_.size();
272+
LOG_DEBUG(" BEFORE: tail_=%zu, entries_.size()=%zu, trampolines_installed_=%d\n",
273+
tail_before, entries_size, (int)trampolines_installed_);
274+
275+
// =========================================================
276+
// POST-RESET STALE TRAMPOLINE HANDLING (ARM64)
277+
// =========================================================
278+
// On ARM64, reset() may have been called but stale trampolines can still
279+
// fire because LR was loaded before we restored the stack location.
280+
// In this case, trampolines_installed_ is false but entries_ still has data.
281+
//
282+
// Stale trampolines fire in predictable order: the deepest pending frame
283+
// (highest index that wasn't consumed) fires first, then the next one up.
284+
// We simply return entries in order starting from tail_-1 and decrementing.
285+
if (!trampolines_installed_ && !entries_.empty()) {
286+
size_t current_tail = tail_.load(std::memory_order_acquire);
287+
LOG_DEBUG(" POST-RESET stale trampoline! tail_=%zu, entries_.size()=%zu\n",
288+
current_tail, entries_.size());
289+
290+
if (current_tail > 0 && current_tail <= entries_.size()) {
291+
// Return the entry at tail-1 (the deepest pending entry)
292+
size_t idx = current_tail - 1;
293+
uintptr_t ret = entries_[idx].return_address;
294+
295+
// Decrement tail_ for the next stale trampoline (if any)
296+
tail_.store(idx, std::memory_order_release);
297+
298+
LOG_DEBUG(" Returning entry[%zu].return_address=0x%lx\n", idx, (unsigned long)ret);
299+
LOG_DEBUG("=== on_ret_trampoline EXIT (post-reset) ===\n");
300+
return ret;
301+
}
302+
303+
// tail_ is 0 or invalid - this shouldn't happen
304+
LOG_ERROR("POST-RESET trampoline: tail_=%zu is invalid!\n", current_tail);
305+
LOG_ERROR(" entries_.size()=%zu\n", entries_.size());
306+
std::abort();
307+
}
308+
309+
// =========================================================
310+
// NORMAL OPERATION
311+
// =========================================================
312+
// Capture current epoch - if it changes during execution, reset() was called
196313
uint64_t current_epoch = epoch_.load(std::memory_order_acquire);
314+
LOG_DEBUG(" current_epoch=%lu\n", (unsigned long)current_epoch);
197315

198316
// Decrement tail first, like nwind does
199317
size_t tail = tail_.fetch_sub(1, std::memory_order_acq_rel) - 1;
318+
LOG_DEBUG(" AFTER fetch_sub: tail=%zu (was %zu)\n", tail, tail_before);
319+
320+
if (entries_.empty()) {
321+
LOG_ERROR("Stack corruption in trampoline: entries_ is EMPTY!\n");
322+
LOG_ERROR(" tail_before=%zu, entries_.size()=%zu\n", tail_before, entries_size);
323+
LOG_ERROR(" this=%p\n", (void*)this);
324+
std::abort();
325+
}
200326

201-
if (entries_.empty() || tail >= entries_.size()) {
202-
LOG_ERROR("Stack corruption in trampoline!\n");
327+
if (tail >= entries_.size()) {
328+
LOG_ERROR("Stack corruption in trampoline: tail >= entries_.size()!\n");
329+
LOG_ERROR(" tail=%zu, entries_.size()=%zu, tail_before=%zu\n",
330+
tail, entries_.size(), tail_before);
331+
LOG_ERROR(" this=%p\n", (void*)this);
203332
std::abort();
204333
}
205334

206335
auto& entry = entries_[tail];
336+
LOG_DEBUG(" entry[%zu]: ip=0x%lx, return_address=0x%lx, location=%p, stack_pointer=0x%lx\n",
337+
tail, (unsigned long)entry.ip, (unsigned long)entry.return_address,
338+
(void*)entry.location, (unsigned long)entry.stack_pointer);
207339

208340
// Check for longjmp: if SP doesn't match expected, search backward
209341
// through shadow stack for matching entry (frames were skipped)
210342
if (sp != 0 && entry.stack_pointer != 0 && entry.stack_pointer != sp) {
211343
LOG_DEBUG("SP mismatch at index %zu: expected 0x%lx, got 0x%lx - checking for longjmp\n",
212-
tail, entry.stack_pointer, sp);
344+
tail, (unsigned long)entry.stack_pointer, (unsigned long)sp);
213345

214346
// Search backward through shadow stack for matching SP (nwind style)
215347
// Only update tail_ if we find a match - don't corrupt it during search
@@ -229,12 +361,17 @@ class GhostStackImpl {
229361
}
230362

231363
// Verify epoch hasn't changed (reset wasn't called during our execution)
232-
if (epoch_.load(std::memory_order_acquire) != current_epoch) {
364+
uint64_t final_epoch = epoch_.load(std::memory_order_acquire);
365+
if (final_epoch != current_epoch) {
233366
LOG_ERROR("Reset detected during trampoline - aborting\n");
367+
LOG_ERROR(" current_epoch=%lu, final_epoch=%lu\n",
368+
(unsigned long)current_epoch, (unsigned long)final_epoch);
234369
std::abort();
235370
}
236371

237372
uintptr_t ret_addr = entries_[tail].return_address;
373+
LOG_DEBUG(" Returning to address 0x%lx\n", (unsigned long)ret_addr);
374+
LOG_DEBUG("=== on_ret_trampoline EXIT ===\n");
238375
return ret_addr;
239376
}
240377

@@ -251,7 +388,7 @@ class GhostStackImpl {
251388
size_t count = (available < max_frames) ? available : max_frames;
252389

253390
for (size_t i = 0; i < count; ++i) {
254-
buffer[i] = reinterpret_cast<void*>(entries_[i].ip);
391+
buffer[i] = reinterpret_cast<void*>(entries_[count - 1 - i].ip);
255392
}
256393

257394
LOG_DEBUG("Fast path: %zu frames\n", count);
@@ -260,11 +397,16 @@ class GhostStackImpl {
260397

261398
// Capture frames using unwinder, install trampolines
262399
size_t capture_and_install(void** buffer, size_t max_frames) {
400+
LOG_DEBUG("=== capture_and_install ENTER ===\n");
401+
LOG_DEBUG(" this=%p, max_frames=%zu\n", (void*)this, max_frames);
402+
263403
// First, capture IPs using the unwinder
264404
std::vector<void*> raw_frames(max_frames);
265405
size_t raw_count = do_unwind(raw_frames.data(), max_frames);
406+
LOG_DEBUG(" do_unwind returned %zu frames\n", raw_count);
266407

267408
if (raw_count == 0) {
409+
LOG_DEBUG(" No frames captured, returning 0\n");
268410
return 0;
269411
}
270412

@@ -277,6 +419,7 @@ class GhostStackImpl {
277419
unw_cursor_t cursor;
278420
unw_getcontext(&ctx);
279421
unw_init_local(&cursor, &ctx);
422+
LOG_DEBUG(" Initialized libunwind cursor\n");
280423

281424
// Skip internal frames (platform-specific due to backtrace/libunwind differences)
282425
#ifdef __APPLE__
@@ -359,14 +502,22 @@ class GhostStackImpl {
359502
step_result = unw_step(&cursor);
360503
} while (step_result > 0);
361504

505+
LOG_DEBUG(" Collected %zu new entries, found_existing=%d\n", new_entries.size(), (int)found_existing);
506+
362507
// Install trampolines on new entries
363-
for (auto& e : new_entries) {
508+
LOG_DEBUG(" Installing trampolines (trampoline addr=%p):\n", (void*)ghost_ret_trampoline);
509+
for (size_t i = 0; i < new_entries.size(); ++i) {
510+
auto& e = new_entries[i];
511+
LOG_DEBUG(" [%zu] location=%p, old_value=0x%lx, ip=0x%lx, expected_sp=0x%lx\n",
512+
i, (void*)e.location, (unsigned long)*e.location,
513+
(unsigned long)e.ip, (unsigned long)e.stack_pointer);
364514
*e.location = reinterpret_cast<uintptr_t>(ghost_ret_trampoline);
365515
}
366516

367517
// Merge with existing entries if we found a patched frame
368518
if (found_existing && !entries_.empty()) {
369519
size_t tail = tail_.load(std::memory_order_acquire);
520+
LOG_DEBUG(" Merging with %zu existing entries\n", tail);
370521
// With reversed order, entries below tail are still valid
371522
// Insert existing valid entries at the beginning of new_entries
372523
new_entries.insert(new_entries.begin(),
@@ -378,13 +529,17 @@ class GhostStackImpl {
378529
tail_.store(entries_.size(), std::memory_order_release);
379530
trampolines_installed_ = true;
380531

532+
LOG_DEBUG(" Final state: entries_.size()=%zu, tail_=%zu\n",
533+
entries_.size(), tail_.load(std::memory_order_acquire));
534+
381535
// Copy to output buffer - return the IP of each frame (what unw_backtrace returns)
536+
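// Reverse order: entries_[count-1] goes to buffer[0], entries_[0] to buffer[count-1]. NOTE(review): if max_frames < entries_.size() this emits the oldest `count` entries rather than the newest - confirm whether entries_.size() - 1 - i was intended.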
382537
size_t count = (entries_.size() < max_frames) ? entries_.size() : max_frames;
383538
for (size_t i = 0; i < count; ++i) {
384539
buffer[i] = reinterpret_cast<void*>(entries_[count - 1 - i].ip);
385540
}
386541

387-
LOG_DEBUG("Captured %zu frames\n", count);
542+
LOG_DEBUG("=== capture_and_install EXIT, returning %zu frames ===\n", count);
388543
return count;
389544
}
390545

@@ -453,7 +608,8 @@ static thread_local ThreadLocalInstance t_instance;
453608
static GhostStackImpl& get_instance() {
454609
if (!t_instance.ptr) {
455610
t_instance.ptr = new GhostStackImpl();
456-
LOG_DEBUG("Created new shadow stack instance for thread\n");
611+
LOG_DEBUG("Created new shadow stack instance for thread: this=%p, tid=%lu\n",
612+
(void*)t_instance.ptr, (unsigned long)pthread_self());
457613
}
458614
return *t_instance.ptr;
459615
}
@@ -546,7 +702,13 @@ void ghost_stack_thread_cleanup(void) {
546702

547703
// Called by assembly trampoline
548704
uintptr_t ghost_trampoline_handler(uintptr_t sp) {
549-
return get_instance().on_ret_trampoline(sp);
705+
LOG_DEBUG(">>> ghost_trampoline_handler called, sp=0x%lx, tid=%lu\n",
706+
(unsigned long)sp, (unsigned long)pthread_self());
707+
auto& impl = get_instance();
708+
LOG_DEBUG(">>> got instance=%p\n", (void*)&impl);
709+
uintptr_t result = impl.on_ret_trampoline(sp);
710+
LOG_DEBUG(">>> ghost_trampoline_handler returning 0x%lx\n", (unsigned long)result);
711+
return result;
550712
}
551713

552714
// Called when exception passes through trampoline

0 commit comments

Comments
 (0)