@@ -53,12 +53,12 @@ extern "C" void ghost_ret_trampoline();
5353// ============================================================================
5454
5555#ifdef DEBUG
56- #define LOG_DEBUG (...) do { fprintf (stderr, " [GhostStack] " __VA_ARGS__); fflush (stderr); } while ( 0 )
56+ #define LOG_DEBUG (...) fprintf(stderr, " [GhostStack] " __VA_ARGS__) // NOTE(review): now a bare fprintf expression - no do/while(0) wrapper, no explicit fflush
5757#else // !DEBUG: debug logging compiles to a no-op
5858#define LOG_DEBUG (...) ((void )0 )
5959#endif // DEBUG
6060
61- #define LOG_ERROR (...) do { fprintf (stderr, " [GhostStack][ERROR] " __VA_ARGS__); fflush (stderr); } while ( 0 )
61+ #define LOG_ERROR (...) fprintf(stderr, " [GhostStack][ERROR] " __VA_ARGS__) // NOTE(review): always enabled; callers in this file follow it with std::abort(), and the explicit fflush is gone - confirm stderr is unbuffered
6262
6363// ============================================================================
6464// Utilities
@@ -85,7 +85,7 @@ static inline uintptr_t ptrauth_strip(uintptr_t val) { return val; }
8585struct StackEntry {
8686 uintptr_t ip; // Instruction pointer of this frame (the value reported in the backtrace buffer)
8787 uintptr_t return_address; // Original return address (what we replaced with trampoline)
88- uintptr_t * location; // Where return address lives on the stack
88+ uintptr_t * location; // Stack slot that holds this frame's return address (overwritten with the trampoline, restored on reset)
8989 uintptr_t stack_pointer; // SP the trampoline is expected to report for this entry (compared at return time for longjmp detection)
9090};
9191
@@ -111,18 +111,20 @@ class GhostStackImpl {
111111 // Main capture function - fills buffer with up to max_frames frame IPs and returns how many were written (0 on a recursive call)
112112 size_t backtrace (void ** buffer, size_t max_frames) {
113113 if (is_capturing_) {
114- LOG_DEBUG (" backtrace: recursive call, bailing out\n " );
115114 return 0 ; // Recursive call, bail out
116115 }
117116 is_capturing_ = true ; // re-entrancy guard; cleared again on both exit paths below
118117
119118 size_t result = 0 ;
120119
121- // Always use capture_and_install - it handles both cases:
122- // 1. No trampolines installed: full capture + install
123- // 2. Trampolines installed: capture new frames up to trampoline, merge with cached
124- LOG_DEBUG (" backtrace: capture_and_install (trampolines_installed=%d, entries=%zu)\n " ,
125- trampolines_installed_, entries_.size ());
120+ // Fast path: trampolines installed, return cached frames without running the unwinder.
121+ if (trampolines_installed_ && !entries_.empty ()) { // NOTE(review): frames entered since the last capture are not walked on this path (the removed code always went through capture_and_install to merge them) - confirm this is intended
122+ result = copy_cached_frames (buffer, max_frames);
123+ is_capturing_ = false ;
124+ return result;
125+ }
126+
127+ // Slow path: capture with unwinder and install trampolines
126128 result = capture_and_install (buffer, max_frames);
127129 is_capturing_ = false ;
128130 return result;
@@ -136,14 +138,29 @@ class GhostStackImpl {
136138 */
137139 void reset () {
138140 if (trampolines_installed_) { // nothing on the stack to undo unless trampolines were written
139- size_t loc = location_.load (std::memory_order_acquire);
140- for (size_t i = loc; i < entries_.size (); ++i) {
141+ size_t tail = tail_.load (std::memory_order_acquire); // entries [0, tail) have not been popped by a trampoline yet
142+ // Entries are stored oldest-first, so the still-patched ones are exactly indices 0 .. tail-1
143+ for (size_t i = 0 ; i < tail; ++i) {
141144 *entries_[i].location = entries_[i].return_address ; // write the original return address back into its stack slot
142145 }
143146 }
144147 clear_entries (); // presumably drops all shadow-stack state - helper definition is not fully visible here
145148 }
146149
150+ public:
151+ /* *
152+ * Pops the newest live shadow-stack entry; used by the exception handler path.
153+ * Decrements tail_ and returns that entry's original return address, skipping the SP-based longjmp check. Aborts if the resulting index is out of range.
154+ */
155+ uintptr_t pop_entry () {
156+ size_t tail = tail_.fetch_sub (1 , std::memory_order_acq_rel) - 1 ; // fetch_sub yields the old value, so tail is the index of the entry being popped
157+ if (tail >= entries_.size ()) { // also catches unsigned wrap-around when tail_ was already 0
158+ LOG_ERROR (" Stack corruption in pop_entry!\n " );
159+ std::abort ();
160+ }
161+ return entries_[tail].return_address ;
162+ }
163+
147164private:
148165 /* *
149166 * Internal helper to clear all state.
@@ -154,7 +171,7 @@ class GhostStackImpl {
154171 epoch_.fetch_add (1 , std::memory_order_release);
155172
156173 entries_.clear ();
157- location_ .store (0 , std::memory_order_release);
174+ tail_ .store (0 , std::memory_order_release);
158175 trampolines_installed_ = false ;
159176 }
160177
@@ -168,7 +185,7 @@ class GhostStackImpl {
168185 * stale or cleared entries.
169186 *
170187 * Implements longjmp detection by comparing the current stack pointer
171- * against the expected value. If they don't match, searches forward
188+ * against the expected value. If they don't match, searches backward
172189 * through the shadow stack to find the matching entry (like nwind does).
173190 *
174191 * @param sp Stack pointer at return time (for longjmp detection)
@@ -178,45 +195,37 @@ class GhostStackImpl {
178195 // Capture current epoch - if it changes, reset() was called
179196 uint64_t current_epoch = epoch_.load (std::memory_order_acquire);
180197
181- size_t loc = location_.load (std::memory_order_acquire);
198+ // Decrement tail first, like nwind does
199+ size_t tail = tail_.fetch_sub (1 , std::memory_order_acq_rel) - 1 ;
182200
183- if (entries_.empty () || loc >= entries_.size ()) {
201+ if (entries_.empty () || tail >= entries_.size ()) {
184202 LOG_ERROR (" Stack corruption in trampoline!\n " );
185203 std::abort ();
186204 }
187205
188- auto & entry = entries_[loc ];
206+ auto & entry = entries_[tail ];
189207
190- // Check for longjmp: if SP doesn't match expected, search forward
208+ // Check for longjmp: if SP doesn't match expected, search backward
191209 // through shadow stack for matching entry (frames were skipped)
192210 if (sp != 0 && entry.stack_pointer != 0 && entry.stack_pointer != sp) {
193211 LOG_DEBUG (" SP mismatch at index %zu: expected 0x%lx, got 0x%lx - checking for longjmp\n " ,
194- loc , entry.stack_pointer , sp);
212+ tail , entry.stack_pointer , sp);
195213
196- // Search forward through shadow stack for matching SP
197- bool found = false ;
198- for (size_t i = loc + 1 ; i < entries_.size (); ++i) {
199- if (entries_[i].stack_pointer == sp) {
214+ // Search backward through shadow stack for matching SP (nwind style)
215+ // Only update tail_ if we find a match - don't corrupt it during search
216+ for (size_t i = tail; i > 0 ; --i) {
217+ if (entries_[i - 1 ].stack_pointer == sp) {
218+ size_t skipped = tail - (i - 1 );
200219 LOG_DEBUG (" longjmp detected: found matching SP at index %zu (skipped %zu frames)\n " ,
201- i, i - loc );
220+ i - 1 , skipped );
202221
203- // Don't restore return addresses for skipped frames - they no longer
204- // exist on the stack after longjmp. Just skip over them.
205- loc = i;
206- location_.store (loc, std::memory_order_release);
207- found = true ;
222+ // Update tail_ to skip all the frames that were bypassed by longjmp
223+ tail_.store (i - 1 , std::memory_order_release);
224+ tail = i - 1 ;
208225 break ;
209226 }
210227 }
211-
212- if (!found) {
213- // No matching entry found - this could be:
214- // 1. A bug in our SP calculation
215- // 2. Stack corruption
216- // 3. Some other unexpected scenario
217- // For now, log and continue with the expected entry
218- LOG_DEBUG (" No matching SP found in shadow stack - continuing with current entry\n " );
219- }
228+ // If no match found, continue with current entry (SP calculation may differ by platform)
220229 }
221230
222231 // Verify epoch hasn't changed (reset wasn't called during our execution)
@@ -225,10 +234,7 @@ class GhostStackImpl {
225234 std::abort ();
226235 }
227236
228- // Re-read location in case it was updated during longjmp handling
229- loc = location_.load (std::memory_order_acquire);
230- uintptr_t ret_addr = entries_[loc].return_address ;
231- location_.fetch_add (1 , std::memory_order_acq_rel);
237+ uintptr_t ret_addr = entries_[tail].return_address ;
232238 return ret_addr;
233239 }
234240
@@ -240,18 +246,15 @@ class GhostStackImpl {
240246 * directly from the shadow stack.
241247 */
242248 size_t copy_cached_frames (void ** buffer, size_t max_frames) {
243- size_t loc = location_ .load (std::memory_order_acquire);
244- size_t available = entries_. size () - loc;
249+ size_t tail = tail_ .load (std::memory_order_acquire);
250+ size_t available = tail; // live entries are indices [0, tail)
245251 size_t count = (available < max_frames) ? available : max_frames; // never write more than the caller's buffer holds
246252
247- LOG_DEBUG (" Fast path: loc=%zu, entries_.size()=%zu, available=%zu, count=%zu\n " ,
248- loc, entries_.size (), available, count);
249-
250253 for (size_t i = 0 ; i < count; ++i) {
251- buffer[i] = reinterpret_cast <void *>(entries_[loc + i].ip );
254+ buffer[i] = reinterpret_cast <void *>(entries_[i].ip ); // NOTE(review): index 0 is the oldest frame after the order reversal, so output is outermost-first and truncation drops the newest frames; the removed code emitted entries_[loc + i] - confirm callers expect this order
252255 }
253256
254- LOG_DEBUG (" Fast path: returning %zu frames\n " , count);
257+ LOG_DEBUG (" Fast path: %zu frames\n " , count);
255258 return count; // number of frame IPs written to buffer
256259 }
257260
@@ -261,8 +264,6 @@ class GhostStackImpl {
261264 std::vector<void *> raw_frames (max_frames);
262265 size_t raw_count = do_unwind (raw_frames.data (), max_frames);
263266
264- LOG_DEBUG (" capture_and_install: raw_count=%zu from unwinder\n " , raw_count);
265-
266267 if (raw_count == 0 ) {
267268 return 0 ;
268269 }
@@ -286,13 +287,10 @@ class GhostStackImpl {
286287 for (int i = 0 ; i < 3 && unw_step (&cursor) > 0 ; ++i) {}
287288#endif
288289
289- size_t frame_idx = 0 ;
290- LOG_DEBUG (" capture_and_install: walking stack frames (raw_count=%zu)...\n " , raw_count);
291- LOG_DEBUG (" capture_and_install: Comparing raw vs walked frames:\n " );
292-
293290 // Process frames: read current frame, then step to next
294291 // Note: After skip loop, cursor is positioned AT the first frame we want
295292 // We need to read first, then step (not step-then-read)
293+ size_t frame_idx = 0 ;
296294 int step_result;
297295 do {
298296 if (frame_idx >= raw_count) break ;
@@ -301,6 +299,23 @@ class GhostStackImpl {
301299 unw_get_reg (&cursor, UNW_REG_IP, &ip);
302300 unw_get_reg (&cursor, GS_SP_REGISTER, &sp);
303301
302+ // On ARM64, strip PAC (Pointer Authentication Code) bits from IP.
303+ // PAC-signed addresses have authentication bits in the upper bits
304+ // that must be stripped for valid address comparison and symbolization.
305+ #ifdef GS_ARCH_AARCH64
306+ ip = ptrauth_strip (ip);
307+ #endif
308+
309+ // On ARM64 Linux, unw_backtrace returns addresses adjusted by -1
310+ // (to point inside the call instruction for symbolization),
311+ // but unw_get_reg(UNW_REG_IP) returns the raw return address.
312+ // Adjust to match unw_backtrace's behavior for consistency.
313+ #if defined(GS_ARCH_AARCH64) && defined(__linux__)
314+ if (ip > 0 ) {
315+ ip = ip - 1 ;
316+ }
317+ #endif
318+
304319 // Get location where return address is stored
305320 uintptr_t * ret_loc = nullptr ;
306321#ifdef __linux__
@@ -313,10 +328,7 @@ class GhostStackImpl {
313328 // macOS: return address is at fp + sizeof(void*)
314329 ret_loc = reinterpret_cast <uintptr_t *>(sp + sizeof (void *));
315330#endif
316- if (!ret_loc) {
317- LOG_DEBUG (" frame %zu: ret_loc is NULL, stopping\n " , frame_idx);
318- break ;
319- }
331+ if (!ret_loc) break ;
320332
321333 uintptr_t ret_addr = *ret_loc;
322334
@@ -329,46 +341,41 @@ class GhostStackImpl {
329341 // Compare against stripped address since trampoline address doesn't have PAC
330342 if (stripped_ret_addr == reinterpret_cast <uintptr_t >(ghost_ret_trampoline)) {
331343 found_existing = true ;
332- LOG_DEBUG (" frame %zu: Found existing trampoline (ip=0x%lx) \n " , frame_idx, ( unsigned long )ip );
344+ LOG_DEBUG (" Found existing trampoline at frame %zu \n " , frame_idx);
333345 break ;
334346 }
335347
336- LOG_DEBUG (" frame %zu: ip=0x%lx, ret_addr=0x%lx, ret_loc=%p\n " ,
337- frame_idx, (unsigned long )ip, (unsigned long )ret_addr, (void *)ret_loc);
338-
339348 // Store the stack pointer that the trampoline will pass.
340349 // The trampoline passes RSP right after landing (before its stack manipulations).
341350 // When RET executes, it pops the return address, so:
342351 // RSP_trampoline = ret_loc + sizeof(void*)
343352 // This allows longjmp detection by comparing against the stored value.
344353 uintptr_t expected_sp = reinterpret_cast <uintptr_t >(ret_loc) + sizeof (void *);
345354 // Store both IP (for returning to caller) and return_address (for trampoline restoration)
346- new_entries.push_back ({ip, ret_addr, ret_loc, expected_sp});
355+ // Insert at beginning to reverse order (oldest at index 0, newest at end)
356+ new_entries.insert (new_entries.begin (), {ip, ret_addr, ret_loc, expected_sp});
347357 frame_idx++;
348358
349359 step_result = unw_step (&cursor);
350360 } while (step_result > 0 );
351- LOG_DEBUG (" capture_and_install: walked %zu frames, found_existing=%d\n " , frame_idx, found_existing);
352361
353362 // Install trampolines on new entries
354- LOG_DEBUG (" capture_and_install: installing %zu trampolines\n " , new_entries.size ());
355363 for (auto & e : new_entries) {
356364 *e.location = reinterpret_cast <uintptr_t >(ghost_ret_trampoline);
357365 }
358366
359367 // Merge with existing entries if we found a patched frame
360368 if (found_existing && !entries_.empty ()) {
361- size_t loc = location_.load (std::memory_order_acquire);
362- LOG_DEBUG (" capture_and_install: merging with existing entries (loc=%zu, existing entries=%zu)\n " ,
363- loc, entries_.size ());
364- new_entries.insert (new_entries.end (),
365- entries_.begin () + static_cast <long >(loc),
366- entries_.end ());
367- LOG_DEBUG (" capture_and_install: after merge, total entries=%zu\n " , new_entries.size ());
369+ size_t tail = tail_.load (std::memory_order_acquire);
370+ // With reversed order, entries below tail are still valid
371+ // Insert existing valid entries at the beginning of new_entries
372+ new_entries.insert (new_entries.begin (),
373+ entries_.begin (),
374+ entries_.begin () + tail);
368375 }
369376
370377 entries_ = std::move (new_entries);
371- location_ .store (0 , std::memory_order_release);
378+ tail_ .store (entries_. size () , std::memory_order_release);
372379 trampolines_installed_ = true ;
373380
374381 // Copy to output buffer - return the IP of each frame (what unw_backtrace returns)
@@ -377,7 +384,7 @@ class GhostStackImpl {
377384 buffer[i] = reinterpret_cast <void *>(entries_[i].ip );
378385 }
379386
380- LOG_DEBUG (" Captured %zu frames (total entries=%zu) \n " , count, entries_. size () );
387+ LOG_DEBUG (" Captured %zu frames\n " , count);
381388 return count;
382389 }
383390
@@ -402,7 +409,7 @@ class GhostStackImpl {
402409 std::vector<StackEntry> entries_;
403410
404411 // Number of live entries in the shadow stack: indices [0, tail_) have not been popped yet (atomic for signal safety)
405- std::atomic<size_t > location_ {0 };
412+ std::atomic<size_t > tail_ {0 };
406413
407414 // Epoch counter - incremented on reset to invalidate in-flight operations
408415 std::atomic<uint64_t > epoch_{0 };
@@ -494,6 +501,8 @@ extern "C" {
494501void ghost_stack_init (ghost_stack_unwinder_t unwinder) {
495502 std::call_once (g_init_flag, [unwinder]() {
496503 g_custom_unwinder = unwinder;
504+ LOG_DEBUG (" Initialized with %s unwinder\n " ,
505+ unwinder ? " custom" : " default" );
497506 });
498507
499508 // Register fork handler (idempotent, safe to call multiple times)
@@ -544,8 +553,9 @@ uintptr_t ghost_trampoline_handler(uintptr_t sp) {
544553uintptr_t ghost_exception_handler (void * exception) {
545554 LOG_DEBUG (" Exception through trampoline\n " );
546555
547- uintptr_t ret = get_instance ().on_ret_trampoline (0 );
548- get_instance ().reset ();
556+ auto & impl = get_instance ();
557+ uintptr_t ret = impl.pop_entry (); // Direct pop, no longjmp check; aborts if the shadow stack index is out of range
558+ impl.reset (); // restore the return addresses still patched on the stack, then drop the shadow stack
549559
550560 __cxxabiv1::__cxa_begin_catch (exception);
551561 return ret; // original return address of the frame whose trampoline was hit
0 commit comments