Skip to content

Commit bc074d5

Browse files
committed
Add backtrace to crash logs
Send crash information from the signal handler to traffic_crashlog and record it in crash logs. This also: * Adds a 10-second timeout around backtrace collection to prevent the crashlog helper from hanging indefinitely if ptrace/waitpid blocks. This addresses a TODO comment that has been in the code for a while. * Removes the "Memory Regions" section from the crash logs. These are very rarely useful and otherwise add significant noise to the crash log. A post mordem gdb section on a core should be able to get this information if it is necessary in a particular instance.
1 parent 915c4c7 commit bc074d5

File tree

10 files changed

+439
-62
lines changed

10 files changed

+439
-62
lines changed

doc/appendices/command-line/traffic_crashlog.en.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ for users to run directly.
3737

3838
When :ref:`traffic_server` starts, it will launch a :program:`traffic_crashlog`
3939
process and keep it stopped, activating it only if a crash occurs.
40+
The crash report includes backtraces for all threads, with the crashing thread
41+
listed first.
4042

4143
Options
4244
=======
@@ -48,8 +50,8 @@ Options
4850
This option specifies the host triple for the process that
4951
:program:`traffic_crashlog` should examine. If a supported host
5052
triple is specified, :program:`traffic_crashlog` expects to
51-
receive a ``siginfo_t`` structure on it's standard input,
52-
followed by a ``ucontext_t``.
53+
receive the crashing thread ID, then a ``siginfo_t`` structure on
54+
its standard input, followed by a ``ucontext_t``.
5355

5456
.. option:: --target PID
5557

src/traffic_crashlog/backtrace.cc

Lines changed: 72 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
****************************************************************************/
3131
#include "tscore/ink_config.h"
3232

33+
#include <sys/types.h>
34+
3335
#if TS_USE_REMOTE_UNWINDING
3436
#include "tscore/Diags.h"
3537

@@ -38,7 +40,6 @@
3840
#include <libunwind.h>
3941
#include <libunwind-ptrace.h>
4042
#if defined(__FreeBSD__)
41-
#include <sys/types.h>
4243
#define __WALL P_ALL
4344
#define PTRACE_ATTACH PT_ATTACH
4445
#define PTRACE_DETACH PT_DETACH
@@ -56,6 +57,7 @@ using threadlist = std::vector<pid_t>;
5657

5758
DbgCtl dbg_ctl_backtrace{"backtrace"};
5859

60+
/** Enumerate all threads for a given process by reading /proc/<pid>/task. */
5961
static threadlist
6062
threads_for_process(pid_t proc)
6163
{
@@ -104,58 +106,73 @@ backtrace_for_thread(pid_t threadid, TextBuffer &text)
104106
unw_cursor_t cursor;
105107
void *ap = nullptr;
106108
pid_t target = -1;
107-
unsigned level = 0;
109+
bool attached{false};
110+
unsigned level = 0;
111+
int step_result;
108112

109113
// First, attach to the child, causing it to stop.
110114
status = ptrace(PTRACE_ATTACH, threadid, 0, 0);
111115
if (status < 0) {
116+
text.format(" [ptrace ATTACH failed: %s (%d)]\n", strerror(errno), errno);
112117
Dbg(dbg_ctl_backtrace, "ptrace(ATTACH, %ld) -> %s (%d)\n", (long)threadid, strerror(errno), errno);
113118
return;
114119
}
120+
attached = true;
115121

116-
// Wait for it to stop (XXX should be a timed wait ...)
122+
// Wait for it to stop. The caller uses alarm() to enforce a timeout.
117123
target = waitpid(threadid, &status, __WALL | WUNTRACED);
118124
Dbg(dbg_ctl_backtrace, "waited for target %ld, found PID %ld, %s\n", (long)threadid, (long)target,
119125
WIFSTOPPED(status) ? "STOPPED" : "???");
120126
if (target < 0) {
127+
text.format(" [waitpid failed: %s (%d)]\n", strerror(errno), errno);
121128
goto done;
122129
}
123130

124131
ap = _UPT_create(threadid);
125132
Dbg(dbg_ctl_backtrace, "created UPT %p", ap);
126133
if (ap == nullptr) {
134+
text.format(" [_UPT_create failed]\n");
127135
goto done;
128136
}
129137

130138
addr_space = unw_create_addr_space(&_UPT_accessors, 0 /* byteorder */);
131139
Dbg(dbg_ctl_backtrace, "created address space %p\n", addr_space);
132140
if (addr_space == nullptr) {
141+
text.format(" [unw_create_addr_space failed]\n");
133142
goto done;
134143
}
135144

136145
status = unw_init_remote(&cursor, addr_space, ap);
137146
Dbg(dbg_ctl_backtrace, "unw_init_remote(...) -> %d\n", status);
138147
if (status != 0) {
148+
text.format(" [unw_init_remote failed: %d]\n", status);
139149
goto done;
140150
}
141151

142-
while (unw_step(&cursor) > 0) {
152+
step_result = unw_step(&cursor);
153+
if (step_result <= 0) {
154+
text.format(" [unw_step returned %d on first call]\n", step_result);
155+
}
156+
157+
while (step_result > 0) {
143158
unw_word_t ip;
144-
unw_word_t offset;
159+
unw_word_t offset = 0;
145160
char buf[256];
146161

147162
unw_get_reg(&cursor, UNW_REG_IP, &ip);
148163

149164
if (unw_get_proc_name(&cursor, buf, sizeof(buf), &offset) == 0) {
150-
int status;
151-
char *name = abi::__cxa_demangle(buf, nullptr, nullptr, &status);
152-
text.format("%-4u 0x%016llx %s + %p\n", level, static_cast<unsigned long long>(ip), name ? name : buf, (void *)offset);
165+
int demangle_status;
166+
char *name = abi::__cxa_demangle(buf, nullptr, nullptr, &demangle_status);
167+
text.format("%-4u 0x%016llx %s + 0x%lx\n", level, static_cast<unsigned long long>(ip), name ? name : buf,
168+
static_cast<unsigned long>(offset));
153169
free(name);
154170
} else {
155-
text.format("%-4u 0x%016llx 0x0 + %p\n", level, static_cast<unsigned long long>(ip), (void *)offset);
171+
text.format("%-4u 0x%016llx <unknown>\n", level, static_cast<unsigned long long>(ip));
156172
}
157173

158174
++level;
175+
step_result = unw_step(&cursor);
159176
}
160177

161178
done:
@@ -167,38 +184,65 @@ backtrace_for_thread(pid_t threadid, TextBuffer &text)
167184
_UPT_destroy(ap);
168185
}
169186

170-
status = ptrace(PTRACE_DETACH, target, NULL, DATA_NULL);
171-
Dbg(dbg_ctl_backtrace, "ptrace(DETACH, %ld) -> %d (errno %d)\n", (long)target, status, errno);
187+
if (attached) {
188+
status = ptrace(PTRACE_DETACH, threadid, nullptr, DATA_NULL);
189+
Dbg(dbg_ctl_backtrace, "ptrace(DETACH, %ld) -> %d (errno %d)\n", (long)threadid, status, errno);
190+
}
191+
}
192+
193+
/** Format a thread header with the thread name from /proc. */
194+
static void
195+
format_thread_header(pid_t threadid, const char *prefix, TextBuffer &text)
196+
{
197+
ats_scoped_fd fd;
198+
char path[128];
199+
200+
snprintf(path, sizeof(path), "/proc/%ld/comm", static_cast<long>(threadid));
201+
fd = open(path, O_RDONLY);
202+
if (fd >= 0) {
203+
text.format("%s (TID %ld, ", prefix, static_cast<long>(threadid));
204+
text.readFromFD(fd);
205+
text.chomp();
206+
text.format("):\n");
207+
} else {
208+
text.format("%s (TID %ld):\n", prefix, static_cast<long>(threadid));
209+
}
172210
}
173211
} // namespace
212+
174213
int
175-
ServerBacktrace(unsigned /* options */, int pid, char **trace)
214+
ServerBacktrace(unsigned /* options */, pid_t pid, pid_t crashing_tid, char **trace)
176215
{
177216
*trace = nullptr;
178217

179218
threadlist threads(threads_for_process(pid));
180219
TextBuffer text(0);
181220

182-
Dbg(dbg_ctl_backtrace, "tracing %zd threads for traffic_server PID %ld\n", threads.size(), (long)pid);
221+
Dbg(dbg_ctl_backtrace, "tracing %zd threads for traffic_server PID %ld, crashing TID %ld\n", threads.size(),
222+
static_cast<long>(pid), static_cast<long>(crashing_tid));
183223

224+
// First, trace the crashing thread.
225+
if (crashing_tid > 0) {
226+
Dbg(dbg_ctl_backtrace, "tracing crashing thread %ld\n", static_cast<long>(crashing_tid));
227+
format_thread_header(crashing_tid, "Crashing Thread", text);
228+
backtrace_for_thread(crashing_tid, text);
229+
text.format("\n");
230+
}
231+
232+
// Then trace all other threads.
233+
bool printed_header = false;
184234
for (auto threadid : threads) {
185-
Dbg(dbg_ctl_backtrace, "tracing thread %ld\n", (long)threadid);
186-
// Get the thread name using /proc/PID/comm
187-
ats_scoped_fd fd;
188-
char threadname[128];
189-
190-
snprintf(threadname, sizeof(threadname), "/proc/%ld/comm", static_cast<long>(threadid));
191-
fd = open(threadname, O_RDONLY);
192-
if (fd >= 0) {
193-
text.format("Thread %ld, ", static_cast<long>(threadid));
194-
text.readFromFD(fd);
195-
text.chomp();
196-
} else {
197-
text.format("Thread %ld", static_cast<long>(threadid));
235+
if (threadid == crashing_tid) {
236+
continue;
198237
}
199238

200-
text.format(":\n");
239+
if (!printed_header) {
240+
text.format("Other Non-Crashing Threads:\n\n");
241+
printed_header = true;
242+
}
201243

244+
Dbg(dbg_ctl_backtrace, "tracing thread %ld\n", static_cast<long>(threadid));
245+
format_thread_header(threadid, "Thread", text);
202246
backtrace_for_thread(threadid, text);
203247
text.format("\n");
204248
}
@@ -210,7 +254,7 @@ ServerBacktrace(unsigned /* options */, int pid, char **trace)
210254
#else /* TS_USE_REMOTE_UNWINDING */
211255

212256
int
213-
ServerBacktrace([[maybe_unused]] unsigned options, [[maybe_unused]] int pid, char **trace)
257+
ServerBacktrace([[maybe_unused]] unsigned options, [[maybe_unused]] pid_t pid, [[maybe_unused]] pid_t crashing_tid, char **trace)
214258
{
215259
*trace = nullptr;
216260
return -1;

src/traffic_crashlog/procinfo.cc

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,6 @@ write_procfd_file(const char *filename, const char *label, FILE *fp, const crash
6666
return !text.empty();
6767
}
6868

69-
bool
70-
crashlog_write_regions(FILE *fp, const crashlog_target &target)
71-
{
72-
return write_procfd_file("maps", "Memory Regions", fp, target);
73-
}
74-
7569
bool
7670
crashlog_write_procstatus(FILE *fp, const crashlog_target &target)
7771
{

0 commit comments

Comments
 (0)