@@ -52,13 +52,15 @@ extern "C" void ghost_ret_trampoline();
5252// Logging (minimal, stderr only)
5353// ============================================================================
5454
// Logging macros. Each one prepends a fixed "[GhostStack]..." tag to the caller's
// format string via string-literal concatenation, so the first macro argument must
// be a string literal.
55- #ifdef DEBUG
56- #define LOG_DEBUG (...) fprintf(stderr, " [GhostStack] " __VA_ARGS__)
55+ // GS_FORCE_DEBUG can be defined via compiler flag (-DGS_FORCE_DEBUG) for test builds
56+ #if defined(DEBUG) || defined(GS_FORCE_DEBUG)
// The do { ... } while (0) wrapper makes the two calls (fprintf + fflush) behave
// as a single statement at the call site.
57+ #define LOG_DEBUG (...) do { fprintf (stderr, " [GhostStack][DEBUG] " __VA_ARGS__); fflush (stderr); } while (0 )
5758#else
// With debugging disabled LOG_DEBUG expands to a no-op expression; its arguments
// are not evaluated.
5859#define LOG_DEBUG (...) ((void )0 )
5960#endif
6061
// LOG_ERROR and LOG_INFO sit outside the #if above, so they are compiled in
// regardless of DEBUG / GS_FORCE_DEBUG.
61- #define LOG_ERROR (...) fprintf(stderr, " [GhostStack][ERROR] " __VA_ARGS__)
62+ #define LOG_ERROR (...) do { fprintf (stderr, " [GhostStack][ERROR] " __VA_ARGS__); fflush (stderr); } while (0 )
63+ #define LOG_INFO (...) do { fprintf (stderr, " [GhostStack][INFO] " __VA_ARGS__); fflush (stderr); } while (0 )
6264
6365// ============================================================================
6466// Utilities
@@ -110,7 +112,14 @@ class GhostStackImpl {
110112
111113 // Main capture function - returns number of frames
112114 size_t backtrace (void ** buffer, size_t max_frames) {
115+ LOG_DEBUG (" === backtrace ENTER ===\n " );
116+ LOG_DEBUG (" this=%p, buffer=%p, max_frames=%zu\n " , (void *)this , (void *)buffer, max_frames);
117+ LOG_DEBUG (" is_capturing_=%d, trampolines_installed_=%d, entries_.size()=%zu, tail_=%zu\n " ,
118+ (int )is_capturing_, (int )trampolines_installed_, entries_.size (),
119+ tail_.load (std::memory_order_acquire));
120+
113121 if (is_capturing_) {
122+ LOG_DEBUG (" Recursive call detected, returning 0\n " );
114123 return 0 ; // Recursive call, bail out
115124 }
116125 is_capturing_ = true ;
@@ -119,32 +128,80 @@ class GhostStackImpl {
119128
120129 // Fast path: trampolines installed, return cached frames
121130 if (trampolines_installed_ && !entries_.empty ()) {
131+ LOG_DEBUG (" Taking FAST PATH (cached frames)\n " );
122132 result = copy_cached_frames (buffer, max_frames);
123133 is_capturing_ = false ;
134+ LOG_DEBUG (" === backtrace EXIT (fast path) result=%zu ===\n " , result);
124135 return result;
125136 }
126137
127138 // Slow path: capture with unwinder and install trampolines
139+ LOG_DEBUG (" Taking SLOW PATH (capture and install)\n " );
140+
141+ // Clear any stale entries from a previous reset before starting fresh capture
142+ if (!entries_.empty () && !trampolines_installed_) {
143+ LOG_DEBUG (" Clearing %zu stale entries from previous reset\n " , entries_.size ());
144+ entries_.clear ();
145+ tail_.store (0 , std::memory_order_release);
146+ }
147+
128148 result = capture_and_install (buffer, max_frames);
129149 is_capturing_ = false ;
150+ LOG_DEBUG (" === backtrace EXIT (slow path) result=%zu ===\n " , result);
130151 return result;
131152 }
132153
133154 /* *
134155 * Reset the shadow stack, restoring all original return addresses.
135156 *
136- * This is the normal reset path - it restores the original return addresses
137- * to the stack before clearing the shadow stack entries.
157+ * On ARM64, stale trampolines may still fire after reset() because the LR
158+ * register may have already been loaded with the trampoline address before
159+ * we restored the stack location. We keep entries_ around to handle these
160+ * stale trampolines gracefully.
161+ *
162+ * We restore ALL entries (not just 0 to tail-1) but only if the location
163+ * still contains the trampoline address. This handles the case where a
164+ * location was reused by a new frame after its original trampoline fired.
138165 */
// reset(): when trampolines are installed, writes the saved return address back to
// every recorded stack location that still holds the trampoline address, clears
// trampolines_installed_ and bumps epoch_. When no trampolines are installed only
// the ENTER/EXIT debug lines run.
139166 void reset () {
167+ LOG_DEBUG (" === reset ENTER ===\n " );
168+ LOG_DEBUG (" this=%p, trampolines_installed_=%d, entries_.size()=%zu, tail_=%zu\n " ,
169+ (void *)this , (int )trampolines_installed_, entries_.size (),
170+ tail_.load (std::memory_order_acquire));
171+
140172 if (trampolines_installed_) {
// Removed ("-") lines below are the previous implementation, which restored only
// indices [0, tail) unconditionally.
141- size_t tail = tail_.load (std::memory_order_acquire);
142- // With reversed order, iterate from 0 to tail (all entries below tail)
143- for (size_t i = 0 ; i < tail; ++i) {
144- *entries_[i].location = entries_[i].return_address ;
// Address of the trampoline as an integer, used as the "still patched" marker.
173+ uintptr_t tramp_addr = reinterpret_cast <uintptr_t >(ghost_ret_trampoline);
174+ LOG_DEBUG (" Restoring locations that still have trampoline (0x%lx)\n " , (unsigned long )tramp_addr);
175+
176+ // Restore ALL entries whose locations still contain the trampoline.
177+ // This handles both pending entries AND already-fired entries whose
178+ // locations haven't been reused by new frames.
179+ for (size_t i = 0 ; i < entries_.size (); ++i) {
// Reads the live stack slot for this entry; the slot is only overwritten below
// if its (stripped) value equals tramp_addr.
180+ uintptr_t current_value = *entries_[i].location ;
181+ // Strip PAC bits before comparison - on ARM64 with PAC enabled,
182+ // the value read from stack may be PAC-signed while tramp_addr is not
// NOTE(review): ptrauth_strip is called with a single argument here, so it is
// presumably a helper defined elsewhere in this file - confirm.
183+ uintptr_t stripped_value = ptrauth_strip (current_value);
184+ if (stripped_value == tramp_addr) {
185+ LOG_DEBUG (" [%zu] location=%p, restoring 0x%lx\n " ,
186+ i, (void *)entries_[i].location , (unsigned long )entries_[i].return_address );
187+ *entries_[i].location = entries_[i].return_address ;
188+ } else {
189+ LOG_DEBUG (" [%zu] location=%p, skipping (current=0x%lx, not trampoline)\n " ,
190+ i, (void *)entries_[i].location , (unsigned long )current_value);
191+ }
145192 }
193+
194+ // Mark trampolines as not installed, but DON'T clear entries_!
195+ // On ARM64, stale trampolines may still fire because LR was loaded
196+ // before we restored the stack. Keep entries_ so we can still
197+ // return the correct address.
198+ trampolines_installed_ = false ;
199+
200+ // Increment epoch to signal state change
// fetch_add returns the value held before the increment, hence the "+ 1" to
// obtain the new epoch for the log line.
201+ uint64_t new_epoch = epoch_.fetch_add (1 , std::memory_order_release) + 1 ;
202+ LOG_DEBUG (" New epoch=%lu (entries preserved for stale trampolines)\n " , (unsigned long )new_epoch);
146203 }
// The clear_entries() call is removed by this change, so entries_ and tail_ are
// left untouched by reset().
147- clear_entries ( );
204+ LOG_DEBUG ( " === reset EXIT === \n " );
148205 }
149206
150207public:
@@ -153,12 +210,22 @@ class GhostStackImpl {
153210 * Decrements tail and returns the return address without longjmp checking.
154211 */
// pop_entry(): decrements tail_ and returns the return_address stored at the new
// tail index; aborts the process if that index is outside entries_.
155212 uintptr_t pop_entry () {
213+ LOG_DEBUG (" === pop_entry ENTER ===\n " );
214+ LOG_DEBUG (" this=%p, entries_.size()=%zu, tail_=%zu\n " ,
215+ (void *)this , entries_.size (), tail_.load (std::memory_order_acquire));
216+
// fetch_sub returns the value before the decrement, so subtracting 1 yields the
// new tail, which is also the index of the entry being popped.
156217 size_t tail = tail_.fetch_sub (1 , std::memory_order_acq_rel) - 1 ;
218+ LOG_DEBUG (" After fetch_sub: tail=%zu\n " , tail);
219+
// tail is unsigned: if tail_ was already 0 the subtraction wraps to a huge value,
// which this bounds check also catches (as does an empty entries_).
157220 if (tail >= entries_.size ()) {
158221 LOG_ERROR (" Stack corruption in pop_entry!\n " );
222+ LOG_ERROR (" tail=%zu, entries_.size()=%zu\n " , tail, entries_.size ());
159223 std::abort ();
160224 }
161- return entries_[tail].return_address ;
// The value is held in a local only so it can be logged before returning.
225+ uintptr_t ret = entries_[tail].return_address ;
226+ LOG_DEBUG (" Returning address 0x%lx\n " , (unsigned long )ret);
227+ LOG_DEBUG (" === pop_entry EXIT ===\n " );
228+ return ret;
162229 }
163230
164231private:
@@ -167,49 +234,114 @@ class GhostStackImpl {
167234 * Increments epoch to invalidate any in-flight trampoline operations.
168235 */
// clear_entries(): bumps epoch_, empties entries_, resets tail_ to 0 and clears
// trampolines_installed_. It does not write anything back to the stack locations
// recorded in the entries.
169236 void clear_entries () {
237+ LOG_DEBUG (" === clear_entries ENTER ===\n " );
238+ LOG_DEBUG (" this=%p, entries_.size()=%zu, tail_=%zu, epoch_=%lu\n " ,
239+ (void *)this , entries_.size (), tail_.load (std::memory_order_acquire),
240+ (unsigned long )epoch_.load (std::memory_order_acquire));
241+
170242 // Increment epoch FIRST to signal any in-flight operations
171- epoch_.fetch_add (1 , std::memory_order_release);
// fetch_add returns the pre-increment value; "+ 1" gives the epoch now stored,
// which is only used for the debug log line.
243+ uint64_t new_epoch = epoch_.fetch_add (1 , std::memory_order_release) + 1 ;
244+ LOG_DEBUG (" New epoch=%lu\n " , (unsigned long )new_epoch);
172245
173246 entries_.clear ();
174247 tail_.store (0 , std::memory_order_release);
175248 trampolines_installed_ = false ;
249+ LOG_DEBUG (" === clear_entries EXIT ===\n " );
176250 }
177251
178252public:
179253
180254 /* *
181255 * Called by trampoline when a function returns.
182256 *
183- * Uses epoch-based validation to detect if reset() was called during
184- * execution (e.g., from a signal handler). This prevents accessing
185- * stale or cleared entries.
186- *
187- * Implements longjmp detection by comparing the current stack pointer
188- * against the expected value. If they don't match, searches backward
189- * through the shadow stack to find the matching entry (like nwind does).
257+ * Handles three scenarios:
258+ * 1. Normal operation: trampolines installed, decrement tail and return
259+ * 2. Post-reset stale trampoline (ARM64): return entries_[tail_ - 1] and store the decremented index back into tail_
260+ * 3. Longjmp detection: SP mismatch, search backward for matching entry
190261 *
191- * @param sp Stack pointer at return time (for longjmp detection)
262+ * @param sp Stack pointer at return time (for longjmp detection / entry lookup )
192263 * @return Original return address to jump to
193264 */
194265 uintptr_t on_ret_trampoline (uintptr_t sp) {
195- // Capture current epoch - if it changes, reset() was called
266+ LOG_DEBUG (" === on_ret_trampoline ENTER ===\n " );
267+ LOG_DEBUG (" this=%p, sp=0x%lx\n " , (void *)this , (unsigned long )sp);
268+
269+ // Log state
270+ size_t tail_before = tail_.load (std::memory_order_acquire);
271+ size_t entries_size = entries_.size ();
272+ LOG_DEBUG (" BEFORE: tail_=%zu, entries_.size()=%zu, trampolines_installed_=%d\n " ,
273+ tail_before, entries_size, (int )trampolines_installed_);
274+
275+ // =========================================================
276+ // POST-RESET STALE TRAMPOLINE HANDLING (ARM64)
277+ // =========================================================
278+ // On ARM64, reset() may have been called but stale trampolines can still
279+ // fire because LR was loaded before we restored the stack location.
280+ // In this case, trampolines_installed_ is false but entries_ still has data.
281+ //
282+ // Stale trampolines fire in predictable order: the deepest pending frame
283+ // (highest index that wasn't consumed) fires first, then the next one up.
284+ // We simply return entries in order starting from tail_-1 and decrementing.
285+ if (!trampolines_installed_ && !entries_.empty ()) {
286+ size_t current_tail = tail_.load (std::memory_order_acquire);
287+ LOG_DEBUG (" POST-RESET stale trampoline! tail_=%zu, entries_.size()=%zu\n " ,
288+ current_tail, entries_.size ());
289+
290+ if (current_tail > 0 && current_tail <= entries_.size ()) {
291+ // Return the entry at tail-1 (the deepest pending entry)
292+ size_t idx = current_tail - 1 ;
293+ uintptr_t ret = entries_[idx].return_address ;
294+
295+ // Decrement tail_ for the next stale trampoline (if any)
296+ tail_.store (idx, std::memory_order_release);
297+
298+ LOG_DEBUG (" Returning entry[%zu].return_address=0x%lx\n " , idx, (unsigned long )ret);
299+ LOG_DEBUG (" === on_ret_trampoline EXIT (post-reset) ===\n " );
300+ return ret;
301+ }
302+
303+ // tail_ is 0 or invalid - this shouldn't happen
304+ LOG_ERROR (" POST-RESET trampoline: tail_=%zu is invalid!\n " , current_tail);
305+ LOG_ERROR (" entries_.size()=%zu\n " , entries_.size ());
306+ std::abort ();
307+ }
308+
309+ // =========================================================
310+ // NORMAL OPERATION
311+ // =========================================================
312+ // Capture current epoch - if it changes during execution, reset() was called
196313 uint64_t current_epoch = epoch_.load (std::memory_order_acquire);
314+ LOG_DEBUG (" current_epoch=%lu\n " , (unsigned long )current_epoch);
197315
198316 // Decrement tail first, like nwind does
199317 size_t tail = tail_.fetch_sub (1 , std::memory_order_acq_rel) - 1 ;
318+ LOG_DEBUG (" AFTER fetch_sub: tail=%zu (was %zu)\n " , tail, tail_before);
319+
320+ if (entries_.empty ()) {
321+ LOG_ERROR (" Stack corruption in trampoline: entries_ is EMPTY!\n " );
322+ LOG_ERROR (" tail_before=%zu, entries_.size()=%zu\n " , tail_before, entries_size);
323+ LOG_ERROR (" this=%p\n " , (void *)this );
324+ std::abort ();
325+ }
200326
201- if (entries_.empty () || tail >= entries_.size ()) {
202- LOG_ERROR (" Stack corruption in trampoline!\n " );
327+ if (tail >= entries_.size ()) {
328+ LOG_ERROR (" Stack corruption in trampoline: tail >= entries_.size()!\n " );
329+ LOG_ERROR (" tail=%zu, entries_.size()=%zu, tail_before=%zu\n " ,
330+ tail, entries_.size (), tail_before);
331+ LOG_ERROR (" this=%p\n " , (void *)this );
203332 std::abort ();
204333 }
205334
206335 auto & entry = entries_[tail];
336+ LOG_DEBUG (" entry[%zu]: ip=0x%lx, return_address=0x%lx, location=%p, stack_pointer=0x%lx\n " ,
337+ tail, (unsigned long )entry.ip , (unsigned long )entry.return_address ,
338+ (void *)entry.location , (unsigned long )entry.stack_pointer );
207339
208340 // Check for longjmp: if SP doesn't match expected, search backward
209341 // through shadow stack for matching entry (frames were skipped)
210342 if (sp != 0 && entry.stack_pointer != 0 && entry.stack_pointer != sp) {
211343 LOG_DEBUG (" SP mismatch at index %zu: expected 0x%lx, got 0x%lx - checking for longjmp\n " ,
212- tail, entry.stack_pointer , sp);
344+ tail, ( unsigned long ) entry.stack_pointer , ( unsigned long ) sp);
213345
214346 // Search backward through shadow stack for matching SP (nwind style)
215347 // Only update tail_ if we find a match - don't corrupt it during search
@@ -229,12 +361,17 @@ class GhostStackImpl {
229361 }
230362
231363 // Verify epoch hasn't changed (reset wasn't called during our execution)
232- if (epoch_.load (std::memory_order_acquire) != current_epoch) {
364+ uint64_t final_epoch = epoch_.load (std::memory_order_acquire);
365+ if (final_epoch != current_epoch) {
233366 LOG_ERROR (" Reset detected during trampoline - aborting\n " );
367+ LOG_ERROR (" current_epoch=%lu, final_epoch=%lu\n " ,
368+ (unsigned long )current_epoch, (unsigned long )final_epoch);
234369 std::abort ();
235370 }
236371
237372 uintptr_t ret_addr = entries_[tail].return_address ;
373+ LOG_DEBUG (" Returning to address 0x%lx\n " , (unsigned long )ret_addr);
374+ LOG_DEBUG (" === on_ret_trampoline EXIT ===\n " );
238375 return ret_addr;
239376 }
240377
@@ -251,7 +388,7 @@ class GhostStackImpl {
251388 size_t count = (available < max_frames) ? available : max_frames;
252389
253390 for (size_t i = 0 ; i < count; ++i) {
254- buffer[i] = reinterpret_cast <void *>(entries_[i].ip );
391+ buffer[i] = reinterpret_cast <void *>(entries_[count - 1 - i].ip );
255392 }
256393
257394 LOG_DEBUG (" Fast path: %zu frames\n " , count);
@@ -260,11 +397,16 @@ class GhostStackImpl {
260397
261398 // Capture frames using unwinder, install trampolines
262399 size_t capture_and_install (void ** buffer, size_t max_frames) {
400+ LOG_DEBUG (" === capture_and_install ENTER ===\n " );
401+ LOG_DEBUG (" this=%p, max_frames=%zu\n " , (void *)this , max_frames);
402+
263403 // First, capture IPs using the unwinder
264404 std::vector<void *> raw_frames (max_frames);
265405 size_t raw_count = do_unwind (raw_frames.data (), max_frames);
406+ LOG_DEBUG (" do_unwind returned %zu frames\n " , raw_count);
266407
267408 if (raw_count == 0 ) {
409+ LOG_DEBUG (" No frames captured, returning 0\n " );
268410 return 0 ;
269411 }
270412
@@ -277,6 +419,7 @@ class GhostStackImpl {
277419 unw_cursor_t cursor;
278420 unw_getcontext (&ctx);
279421 unw_init_local (&cursor, &ctx);
422+ LOG_DEBUG (" Initialized libunwind cursor\n " );
280423
281424 // Skip internal frames (platform-specific due to backtrace/libunwind differences)
282425#ifdef __APPLE__
@@ -359,14 +502,22 @@ class GhostStackImpl {
359502 step_result = unw_step (&cursor);
360503 } while (step_result > 0 );
361504
505+ LOG_DEBUG (" Collected %zu new entries, found_existing=%d\n " , new_entries.size (), (int )found_existing);
506+
362507 // Install trampolines on new entries
363- for (auto & e : new_entries) {
508+ LOG_DEBUG (" Installing trampolines (trampoline addr=%p):\n " , (void *)ghost_ret_trampoline);
509+ for (size_t i = 0 ; i < new_entries.size (); ++i) {
510+ auto & e = new_entries[i];
511+ LOG_DEBUG (" [%zu] location=%p, old_value=0x%lx, ip=0x%lx, expected_sp=0x%lx\n " ,
512+ i, (void *)e.location , (unsigned long )*e.location ,
513+ (unsigned long )e.ip , (unsigned long )e.stack_pointer );
364514 *e.location = reinterpret_cast <uintptr_t >(ghost_ret_trampoline);
365515 }
366516
367517 // Merge with existing entries if we found a patched frame
368518 if (found_existing && !entries_.empty ()) {
369519 size_t tail = tail_.load (std::memory_order_acquire);
520+ LOG_DEBUG (" Merging with %zu existing entries\n " , tail);
370521 // With reversed order, entries below tail are still valid
371522 // Insert existing valid entries at the beginning of new_entries
372523 new_entries.insert (new_entries.begin (),
@@ -378,13 +529,17 @@ class GhostStackImpl {
378529 tail_.store (entries_.size (), std::memory_order_release);
379530 trampolines_installed_ = true ;
380531
532+ LOG_DEBUG (" Final state: entries_.size()=%zu, tail_=%zu\n " ,
533+ entries_.size (), tail_.load (std::memory_order_acquire));
534+
381535 // Copy to output buffer - return the IP of each frame (what unw_backtrace returns)
536+ // Reverse order: newest frame at buffer[0], oldest at buffer[count-1]
382537 size_t count = (entries_.size () < max_frames) ? entries_.size () : max_frames;
383538 for (size_t i = 0 ; i < count; ++i) {
384539 buffer[i] = reinterpret_cast <void *>(entries_[count - 1 - i].ip );
385540 }
386541
387- LOG_DEBUG (" Captured %zu frames\n " , count);
542+ LOG_DEBUG (" === capture_and_install EXIT, returning %zu frames === \n " , count);
388543 return count;
389544 }
390545
@@ -453,7 +608,8 @@ static thread_local ThreadLocalInstance t_instance;
// Returns the GhostStackImpl held in t_instance, allocating it on the first call
// (t_instance.ptr is null until then).
// NOTE(review): the matching delete is not visible in this view - presumably
// handled by ThreadLocalInstance or ghost_stack_thread_cleanup; confirm.
453608static GhostStackImpl& get_instance () {
454609 if (!t_instance.ptr ) {
455610 t_instance.ptr = new GhostStackImpl ();
456- LOG_DEBUG (" Created new shadow stack instance for thread\n " );
// The log line now includes the instance pointer and pthread_self() so that
// output from different threads can be told apart.
611+ LOG_DEBUG (" Created new shadow stack instance for thread: this=%p, tid=%lu\n " ,
612+ (void *)t_instance.ptr , (unsigned long )pthread_self ());
457613 }
458614 return *t_instance.ptr ;
459615}
@@ -546,7 +702,13 @@ void ghost_stack_thread_cleanup(void) {
546702
547703// Called by assembly trampoline
// Forwards the stack pointer to on_ret_trampoline() on the instance returned by
// get_instance() and returns the address that call produces, logging before and
// after.
548704uintptr_t ghost_trampoline_handler (uintptr_t sp) {
549- return get_instance ().on_ret_trampoline (sp);
705+ LOG_DEBUG (" >>> ghost_trampoline_handler called, sp=0x%lx, tid=%lu\n " ,
706+ (unsigned long )sp, (unsigned long )pthread_self ());
// get_instance() may allocate the instance if none exists yet for this thread.
707+ auto & impl = get_instance ();
708+ LOG_DEBUG (" >>> got instance=%p\n " , (void *)&impl);
709+ uintptr_t result = impl.on_ret_trampoline (sp);
710+ LOG_DEBUG (" >>> ghost_trampoline_handler returning 0x%lx\n " , (unsigned long )result);
711+ return result;
550712}
551713
552714// Called when exception passes through trampoline
0 commit comments