Skip to content

Commit e4bc3f6

Browse files
committed
Update ghost unwind
1 parent a7d6f78 commit e4bc3f6

File tree

1 file changed

+87
-77
lines changed

1 file changed

+87
-77
lines changed

src/memray/_memray/ghost_stack/src/ghost_stack.cpp

Lines changed: 87 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,12 @@ extern "C" void ghost_ret_trampoline();
5353
// ============================================================================
5454

5555
// Logging helpers.
//
// Both macros prepend a "[GhostStack]" tag via string-literal concatenation,
// so the first argument must be a string literal format; the remaining
// arguments are forwarded to fprintf unchanged.
//
// Each macro expands to a single parenthesised expression of type void. That
// keeps the DEBUG and non-DEBUG variants interchangeable (same type, usable as
// a plain statement or as the unbraced body of an if/else) and prevents the
// fprintf return value from being consumed by accident.
#ifdef DEBUG
#define LOG_DEBUG(...) ((void)fprintf(stderr, "[GhostStack] " __VA_ARGS__))
#else
#define LOG_DEBUG(...) ((void)0)
#endif

// Errors are flushed explicitly: the call sites in this file follow LOG_ERROR
// with std::abort(), which is not required to flush buffered output, so the
// message could otherwise be lost if stderr has been made fully buffered.
#define LOG_ERROR(...) \
    ((void)fprintf(stderr, "[GhostStack][ERROR] " __VA_ARGS__), (void)fflush(stderr))
6262

6363
// ============================================================================
6464
// Utilities
@@ -85,7 +85,7 @@ static inline uintptr_t ptrauth_strip(uintptr_t val) { return val; }
8585
// One shadow-stack record describing a frame whose return address was
// replaced with the trampoline. Field order matters: entries are created
// with positional brace initialisation {ip, return_address, location, sp}.
struct StackEntry {
    uintptr_t ip;              // Instruction pointer of this frame (what to return to caller)
    uintptr_t return_address;  // Original return address (what we replaced with trampoline)
    uintptr_t* location;       // Stack slot that held return_address; it is overwritten with the
                               // trampoline address and written back from return_address on reset
    uintptr_t stack_pointer;   // SP expected when the trampoline fires, recorded at capture time;
                               // compared against the actual SP to detect skipped frames (longjmp)
};
9191

@@ -111,18 +111,20 @@ class GhostStackImpl {
111111
// Main capture function - returns number of frames
112112
size_t backtrace(void** buffer, size_t max_frames) {
113113
if (is_capturing_) {
114-
LOG_DEBUG("backtrace: recursive call, bailing out\n");
115114
return 0; // Recursive call, bail out
116115
}
117116
is_capturing_ = true;
118117

119118
size_t result = 0;
120119

121-
// Always use capture_and_install - it handles both cases:
122-
// 1. No trampolines installed: full capture + install
123-
// 2. Trampolines installed: capture new frames up to trampoline, merge with cached
124-
LOG_DEBUG("backtrace: capture_and_install (trampolines_installed=%d, entries=%zu)\n",
125-
trampolines_installed_, entries_.size());
120+
// Fast path: trampolines installed, return cached frames
121+
if (trampolines_installed_ && !entries_.empty()) {
122+
result = copy_cached_frames(buffer, max_frames);
123+
is_capturing_ = false;
124+
return result;
125+
}
126+
127+
// Slow path: capture with unwinder and install trampolines
126128
result = capture_and_install(buffer, max_frames);
127129
is_capturing_ = false;
128130
return result;
@@ -136,14 +138,29 @@ class GhostStackImpl {
136138
*/
137139
void reset() {
138140
if (trampolines_installed_) {
139-
size_t loc = location_.load(std::memory_order_acquire);
140-
for (size_t i = loc; i < entries_.size(); ++i) {
141+
size_t tail = tail_.load(std::memory_order_acquire);
142+
// With reversed order, iterate from 0 to tail (all entries below tail)
143+
for (size_t i = 0; i < tail; ++i) {
141144
*entries_[i].location = entries_[i].return_address;
142145
}
143146
}
144147
clear_entries();
145148
}
146149

150+
public:
151+
/**
152+
* Direct entry access method for exception handling.
153+
* Decrements tail and returns the return address without longjmp checking.
154+
*/
155+
uintptr_t pop_entry() {
156+
size_t tail = tail_.fetch_sub(1, std::memory_order_acq_rel) - 1;
157+
if (tail >= entries_.size()) {
158+
LOG_ERROR("Stack corruption in pop_entry!\n");
159+
std::abort();
160+
}
161+
return entries_[tail].return_address;
162+
}
163+
147164
private:
148165
/**
149166
* Internal helper to clear all state.
@@ -154,7 +171,7 @@ class GhostStackImpl {
154171
epoch_.fetch_add(1, std::memory_order_release);
155172

156173
entries_.clear();
157-
location_.store(0, std::memory_order_release);
174+
tail_.store(0, std::memory_order_release);
158175
trampolines_installed_ = false;
159176
}
160177

@@ -168,7 +185,7 @@ class GhostStackImpl {
168185
* stale or cleared entries.
169186
*
170187
* Implements longjmp detection by comparing the current stack pointer
171-
* against the expected value. If they don't match, searches forward
188+
* against the expected value. If they don't match, searches backward
172189
* through the shadow stack to find the matching entry (like nwind does).
173190
*
174191
* @param sp Stack pointer at return time (for longjmp detection)
@@ -178,45 +195,37 @@ class GhostStackImpl {
178195
// Capture current epoch - if it changes, reset() was called
179196
uint64_t current_epoch = epoch_.load(std::memory_order_acquire);
180197

181-
size_t loc = location_.load(std::memory_order_acquire);
198+
// Decrement tail first, like nwind does
199+
size_t tail = tail_.fetch_sub(1, std::memory_order_acq_rel) - 1;
182200

183-
if (entries_.empty() || loc >= entries_.size()) {
201+
if (entries_.empty() || tail >= entries_.size()) {
184202
LOG_ERROR("Stack corruption in trampoline!\n");
185203
std::abort();
186204
}
187205

188-
auto& entry = entries_[loc];
206+
auto& entry = entries_[tail];
189207

190-
// Check for longjmp: if SP doesn't match expected, search forward
208+
// Check for longjmp: if SP doesn't match expected, search backward
191209
// through shadow stack for matching entry (frames were skipped)
192210
if (sp != 0 && entry.stack_pointer != 0 && entry.stack_pointer != sp) {
193211
LOG_DEBUG("SP mismatch at index %zu: expected 0x%lx, got 0x%lx - checking for longjmp\n",
194-
loc, entry.stack_pointer, sp);
212+
tail, entry.stack_pointer, sp);
195213

196-
// Search forward through shadow stack for matching SP
197-
bool found = false;
198-
for (size_t i = loc + 1; i < entries_.size(); ++i) {
199-
if (entries_[i].stack_pointer == sp) {
214+
// Search backward through shadow stack for matching SP (nwind style)
215+
// Only update tail_ if we find a match - don't corrupt it during search
216+
for (size_t i = tail; i > 0; --i) {
217+
if (entries_[i - 1].stack_pointer == sp) {
218+
size_t skipped = tail - (i - 1);
200219
LOG_DEBUG("longjmp detected: found matching SP at index %zu (skipped %zu frames)\n",
201-
i, i - loc);
220+
i - 1, skipped);
202221

203-
// Don't restore return addresses for skipped frames - they no longer
204-
// exist on the stack after longjmp. Just skip over them.
205-
loc = i;
206-
location_.store(loc, std::memory_order_release);
207-
found = true;
222+
// Update tail_ to skip all the frames that were bypassed by longjmp
223+
tail_.store(i - 1, std::memory_order_release);
224+
tail = i - 1;
208225
break;
209226
}
210227
}
211-
212-
if (!found) {
213-
// No matching entry found - this could be:
214-
// 1. A bug in our SP calculation
215-
// 2. Stack corruption
216-
// 3. Some other unexpected scenario
217-
// For now, log and continue with the expected entry
218-
LOG_DEBUG("No matching SP found in shadow stack - continuing with current entry\n");
219-
}
228+
// If no match found, continue with current entry (SP calculation may differ by platform)
220229
}
221230

222231
// Verify epoch hasn't changed (reset wasn't called during our execution)
@@ -225,10 +234,7 @@ class GhostStackImpl {
225234
std::abort();
226235
}
227236

228-
// Re-read location in case it was updated during longjmp handling
229-
loc = location_.load(std::memory_order_acquire);
230-
uintptr_t ret_addr = entries_[loc].return_address;
231-
location_.fetch_add(1, std::memory_order_acq_rel);
237+
uintptr_t ret_addr = entries_[tail].return_address;
232238
return ret_addr;
233239
}
234240

@@ -240,18 +246,15 @@ class GhostStackImpl {
240246
* directly from the shadow stack.
241247
*/
242248
size_t copy_cached_frames(void** buffer, size_t max_frames) {
243-
size_t loc = location_.load(std::memory_order_acquire);
244-
size_t available = entries_.size() - loc;
249+
size_t tail = tail_.load(std::memory_order_acquire);
250+
size_t available = tail; // frames from 0 to tail-1
245251
size_t count = (available < max_frames) ? available : max_frames;
246252

247-
LOG_DEBUG("Fast path: loc=%zu, entries_.size()=%zu, available=%zu, count=%zu\n",
248-
loc, entries_.size(), available, count);
249-
250253
for (size_t i = 0; i < count; ++i) {
251-
buffer[i] = reinterpret_cast<void*>(entries_[loc + i].ip);
254+
buffer[i] = reinterpret_cast<void*>(entries_[i].ip);
252255
}
253256

254-
LOG_DEBUG("Fast path: returning %zu frames\n", count);
257+
LOG_DEBUG("Fast path: %zu frames\n", count);
255258
return count;
256259
}
257260

@@ -261,8 +264,6 @@ class GhostStackImpl {
261264
std::vector<void*> raw_frames(max_frames);
262265
size_t raw_count = do_unwind(raw_frames.data(), max_frames);
263266

264-
LOG_DEBUG("capture_and_install: raw_count=%zu from unwinder\n", raw_count);
265-
266267
if (raw_count == 0) {
267268
return 0;
268269
}
@@ -286,13 +287,10 @@ class GhostStackImpl {
286287
for (int i = 0; i < 3 && unw_step(&cursor) > 0; ++i) {}
287288
#endif
288289

289-
size_t frame_idx = 0;
290-
LOG_DEBUG("capture_and_install: walking stack frames (raw_count=%zu)...\n", raw_count);
291-
LOG_DEBUG("capture_and_install: Comparing raw vs walked frames:\n");
292-
293290
// Process frames: read current frame, then step to next
294291
// Note: After skip loop, cursor is positioned AT the first frame we want
295292
// We need to read first, then step (not step-then-read)
293+
size_t frame_idx = 0;
296294
int step_result;
297295
do {
298296
if (frame_idx >= raw_count) break;
@@ -301,6 +299,23 @@ class GhostStackImpl {
301299
unw_get_reg(&cursor, UNW_REG_IP, &ip);
302300
unw_get_reg(&cursor, GS_SP_REGISTER, &sp);
303301

302+
// On ARM64, strip PAC (Pointer Authentication Code) bits from IP.
303+
// PAC-signed addresses have authentication bits in the upper bits
304+
// that must be stripped for valid address comparison and symbolization.
305+
#ifdef GS_ARCH_AARCH64
306+
ip = ptrauth_strip(ip);
307+
#endif
308+
309+
// On ARM64 Linux, unw_backtrace returns addresses adjusted by -1
310+
// (to point inside the call instruction for symbolization),
311+
// but unw_get_reg(UNW_REG_IP) returns the raw return address.
312+
// Adjust to match unw_backtrace's behavior for consistency.
313+
#if defined(GS_ARCH_AARCH64) && defined(__linux__)
314+
if (ip > 0) {
315+
ip = ip - 1;
316+
}
317+
#endif
318+
304319
// Get location where return address is stored
305320
uintptr_t* ret_loc = nullptr;
306321
#ifdef __linux__
@@ -313,10 +328,7 @@ class GhostStackImpl {
313328
// macOS: return address is at fp + sizeof(void*)
314329
ret_loc = reinterpret_cast<uintptr_t*>(sp + sizeof(void*));
315330
#endif
316-
if (!ret_loc) {
317-
LOG_DEBUG(" frame %zu: ret_loc is NULL, stopping\n", frame_idx);
318-
break;
319-
}
331+
if (!ret_loc) break;
320332

321333
uintptr_t ret_addr = *ret_loc;
322334

@@ -329,46 +341,41 @@ class GhostStackImpl {
329341
// Compare against stripped address since trampoline address doesn't have PAC
330342
if (stripped_ret_addr == reinterpret_cast<uintptr_t>(ghost_ret_trampoline)) {
331343
found_existing = true;
332-
LOG_DEBUG(" frame %zu: Found existing trampoline (ip=0x%lx)\n", frame_idx, (unsigned long)ip);
344+
LOG_DEBUG("Found existing trampoline at frame %zu\n", frame_idx);
333345
break;
334346
}
335347

336-
LOG_DEBUG(" frame %zu: ip=0x%lx, ret_addr=0x%lx, ret_loc=%p\n",
337-
frame_idx, (unsigned long)ip, (unsigned long)ret_addr, (void*)ret_loc);
338-
339348
// Store the stack pointer that the trampoline will pass.
340349
// The trampoline passes RSP right after landing (before its stack manipulations).
341350
// When RET executes, it pops the return address, so:
342351
// RSP_trampoline = ret_loc + sizeof(void*)
343352
// This allows longjmp detection by comparing against the stored value.
344353
uintptr_t expected_sp = reinterpret_cast<uintptr_t>(ret_loc) + sizeof(void*);
345354
// Store both IP (for returning to caller) and return_address (for trampoline restoration)
346-
new_entries.push_back({ip, ret_addr, ret_loc, expected_sp});
355+
// Insert at beginning to reverse order (oldest at index 0, newest at end)
356+
new_entries.insert(new_entries.begin(), {ip, ret_addr, ret_loc, expected_sp});
347357
frame_idx++;
348358

349359
step_result = unw_step(&cursor);
350360
} while (step_result > 0);
351-
LOG_DEBUG("capture_and_install: walked %zu frames, found_existing=%d\n", frame_idx, found_existing);
352361

353362
// Install trampolines on new entries
354-
LOG_DEBUG("capture_and_install: installing %zu trampolines\n", new_entries.size());
355363
for (auto& e : new_entries) {
356364
*e.location = reinterpret_cast<uintptr_t>(ghost_ret_trampoline);
357365
}
358366

359367
// Merge with existing entries if we found a patched frame
360368
if (found_existing && !entries_.empty()) {
361-
size_t loc = location_.load(std::memory_order_acquire);
362-
LOG_DEBUG("capture_and_install: merging with existing entries (loc=%zu, existing entries=%zu)\n",
363-
loc, entries_.size());
364-
new_entries.insert(new_entries.end(),
365-
entries_.begin() + static_cast<long>(loc),
366-
entries_.end());
367-
LOG_DEBUG("capture_and_install: after merge, total entries=%zu\n", new_entries.size());
369+
size_t tail = tail_.load(std::memory_order_acquire);
370+
// With reversed order, entries below tail are still valid
371+
// Insert existing valid entries at the beginning of new_entries
372+
new_entries.insert(new_entries.begin(),
373+
entries_.begin(),
374+
entries_.begin() + tail);
368375
}
369376

370377
entries_ = std::move(new_entries);
371-
location_.store(0, std::memory_order_release);
378+
tail_.store(entries_.size(), std::memory_order_release);
372379
trampolines_installed_ = true;
373380

374381
// Copy to output buffer - return the IP of each frame (what unw_backtrace returns)
@@ -377,7 +384,7 @@ class GhostStackImpl {
377384
buffer[i] = reinterpret_cast<void*>(entries_[i].ip);
378385
}
379386

380-
LOG_DEBUG("Captured %zu frames (total entries=%zu)\n", count, entries_.size());
387+
LOG_DEBUG("Captured %zu frames\n", count);
381388
return count;
382389
}
383390

@@ -402,7 +409,7 @@ class GhostStackImpl {
402409
std::vector<StackEntry> entries_;
403410

404411
// Current position in the shadow stack (atomic for signal safety)
405-
std::atomic<size_t> location_{0};
412+
std::atomic<size_t> tail_{0};
406413

407414
// Epoch counter - incremented on reset to invalidate in-flight operations
408415
std::atomic<uint64_t> epoch_{0};
@@ -494,6 +501,8 @@ extern "C" {
494501
void ghost_stack_init(ghost_stack_unwinder_t unwinder) {
495502
std::call_once(g_init_flag, [unwinder]() {
496503
g_custom_unwinder = unwinder;
504+
LOG_DEBUG("Initialized with %s unwinder\n",
505+
unwinder ? "custom" : "default");
497506
});
498507

499508
// Register fork handler (idempotent, safe to call multiple times)
@@ -544,8 +553,9 @@ uintptr_t ghost_trampoline_handler(uintptr_t sp) {
544553
uintptr_t ghost_exception_handler(void* exception) {
545554
LOG_DEBUG("Exception through trampoline\n");
546555

547-
uintptr_t ret = get_instance().on_ret_trampoline(0);
548-
get_instance().reset();
556+
auto& impl = get_instance();
557+
uintptr_t ret = impl.pop_entry(); // Direct pop, no longjmp check
558+
impl.reset();
549559

550560
__cxxabiv1::__cxa_begin_catch(exception);
551561
return ret;

0 commit comments

Comments
 (0)