Skip to content

Commit 40f46ee

Browse files
authored
feat(crashtracking): capture unhandled exception with the crashtracker (#5321)
* Remove VS Code config files from tracking * Gitignore weird files that keep popping up (will pop this commit later) Signal based crash report (crash done, need to do ping) Revert "Gitignore weird files that keep popping up (will pop this commit later)" This reverts commit aeb3017. Revert "Remove VS Code config files from tracking" This reverts commit 2b30b86. Use locations array Clean Lazy logging Fix memory leak * Crash ping Fmt fmt Do work on ruby side, fix sus calls Remove noisy log Update symbol name Check result, build message in ruby unit test and test cleanup Inline + no order dependency + cleanup Number of frames logic on ruby side frame processing in helper Restore accidentally deleted comment Update tags on fork Fmt Fix potential mem leak move to core clean Extract into helper Fix more potential leaks Fmt Remove comment from Ruby exception crash reporting context Removed comment about Ruby exception crash reporting tests. Respond to oleg -(rescuing all exceptions) Flip negation No more do-while, crash vs exception naming, test sleep fix, minor refactoring Tag builder helper func, move all logic into ct component, move builder into build function * Comprehensive testing * Timestamp is handled by libdatadog Trigger CI rbs file Trigger CI CI debug We need to explicitly check, not depend on order Be explicit with typing Trigger CI Incomplete stack Clarity in tests * Final cleanups
1 parent 476256d commit 40f46ee

File tree

6 files changed

+429
-21
lines changed

6 files changed

+429
-21
lines changed

ext/libdatadog_api/crashtracker.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
static VALUE _native_start_or_update_on_fork(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _self);
88
static VALUE _native_stop(DDTRACE_UNUSED VALUE _self);
99

10+
void crashtracker_report_exception_init(VALUE crashtracker_class);
11+
1012
static bool first_init = true;
1113

1214
// Used to report Ruby VM crashes.
@@ -18,6 +20,9 @@ void crashtracker_init(VALUE core_module) {
1820

1921
rb_define_singleton_method(crashtracker_class, "_native_start_or_update_on_fork", _native_start_or_update_on_fork, -1);
2022
rb_define_singleton_method(crashtracker_class, "_native_stop", _native_stop, 0);
23+
24+
// Initialize Ruby non-signal-crash reporting
25+
crashtracker_report_exception_init(crashtracker_class);
2126
}
2227

2328
static VALUE _native_start_or_update_on_fork(int argc, VALUE *argv, DDTRACE_UNUSED VALUE _self) {
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
#include <datadog/common.h>
2+
#include <datadog/crashtracker.h>
3+
#include <ruby.h>
4+
#include <sys/types.h>
5+
#include <unistd.h>
6+
#include <string.h>
7+
8+
#include "datadog_ruby_common.h"
9+
10+
static VALUE _native_report_ruby_exception(VALUE _self, VALUE agent_base_url,
11+
VALUE message, VALUE frames_data,
12+
VALUE tags_as_array, VALUE library_version);
13+
14+
static bool process_crash_frames(VALUE frames_data, ddog_crasht_Handle_StackTrace *stack_trace);
15+
static bool build_and_send_crash_report(ddog_crasht_Metadata metadata,
16+
ddog_Endpoint *endpoint,
17+
VALUE message,
18+
VALUE frames_data);
19+
20+
// Registers the native entry point used to report unhandled Ruby exceptions
// as a singleton method on the given crashtracker class.
//
// crashtracker_class: Ruby class that receives `_native_report_ruby_exception`.
void crashtracker_report_exception_init(VALUE crashtracker_class) {
  rb_define_singleton_method(
    crashtracker_class,
    "_native_report_ruby_exception",
    _native_report_ruby_exception,
    5 // arity: agent_base_url, message, frames_data, tags_as_array, library_version
  );
}
24+
25+
// Ruby-facing entry point: builds a crash report for an unhandled Ruby exception
// and uploads it to the agent.
//
// agent_base_url:  String used to create the libdatadog endpoint.
// message:         String used as the crash message.
// frames_data:     Array of frames, handed as-is to build_and_send_crash_report.
// tags_as_array:   Array converted into a ddog_Vec_Tag via convert_tags.
// library_version: String reported as the library version in the metadata.
//
// Returns Qtrue when build_and_send_crash_report succeeds; Qfalse when the endpoint
// cannot be created or the report could not be built/sent.
static VALUE _native_report_ruby_exception(DDTRACE_UNUSED VALUE _self, VALUE agent_base_url,
                                           VALUE message, VALUE frames_data,
                                           VALUE tags_as_array, VALUE library_version) {
  // Argument validation happens before anything is allocated.
  // NOTE(review): ENFORCE_TYPE presumably raises on a mismatch -- confirm in datadog_ruby_common.h
  ENFORCE_TYPE(agent_base_url, T_STRING);
  ENFORCE_TYPE(message, T_STRING);
  ENFORCE_TYPE(frames_data, T_ARRAY);
  ENFORCE_TYPE(tags_as_array, T_ARRAY);
  ENFORCE_TYPE(library_version, T_STRING);

  ddog_Endpoint *endpoint = ddog_endpoint_from_url(char_slice_from_ruby_string(agent_base_url));
  if (endpoint == NULL) {
    return Qfalse;
  }

  // NOTE(review): if convert_tags can raise on malformed input, `endpoint` would leak here -- verify
  ddog_Vec_Tag tags = convert_tags(tags_as_array);

  ddog_crasht_Metadata metadata = {
    .library_name = DDOG_CHARSLICE_C("dd-trace-rb"),
    .library_version = char_slice_from_ruby_string(library_version),
    .family = DDOG_CHARSLICE_C("ruby"),
    .tags = &tags,
  };

  VALUE result = build_and_send_crash_report(metadata, endpoint, message, frames_data) ? Qtrue : Qfalse;

  // Both resources are owned by this function and released regardless of the outcome
  ddog_Vec_Tag_drop(tags);
  ddog_endpoint_drop(endpoint);

  return result;
}
52+
53+
// Converts the Ruby-side frame list into libdatadog stack frames and pushes them
// onto `stack_trace`, in order.
//
// frames_data: Ruby Array whose elements are expected to be [String file, String function, Integer line].
// stack_trace: destination stack trace handle; frames pushed before a failure stay in it.
//
// Returns true only when every frame was pushed. Returns false when there are no frames,
// when an element is malformed, or when any libdatadog call fails; the caller uses this
// to decide whether the stack may be marked as complete.
static bool process_crash_frames(VALUE frames_data, ddog_crasht_Handle_StackTrace *stack_trace) {
  size_t frame_count = RARRAY_LEN(frames_data);

  // Return false and early so we can mark the stack as incomplete
  // libdatadog's definition of an incomplete stack is that it has no frames
  // or that report generation died in the middle of unwinding frames
  if (frame_count == 0) {
    return false;
  }

  for (size_t i = 0; i < frame_count; i++) {
    VALUE frame_array = RARRAY_AREF(frames_data, i);

    // Ruby should guarantee [String, String, Integer]; anything else is a bug on the Ruby side, bail out
    if (!RB_TYPE_P(frame_array, T_ARRAY) || RARRAY_LEN(frame_array) != 3) {
      return false;
    }

    VALUE file_val = RARRAY_AREF(frame_array, 0);
    VALUE func_val = RARRAY_AREF(frame_array, 1);
    VALUE line_val = RARRAY_AREF(frame_array, 2);

    if (!RB_TYPE_P(file_val, T_STRING) || !RB_TYPE_P(func_val, T_STRING) || !RB_TYPE_P(line_val, T_FIXNUM)) {
      return false;
    }

    // Read the line number BEFORE allocating the frame and without FIX2INT: FIX2INT may raise a
    // RangeError for values that do not fit in an int, which would unwind past our cleanup and leak
    // the native handles. FIX2LONG never raises. Negative or too-large values are treated as
    // "no line information" instead of being wrapped into a bogus uint32_t.
    long line_number = FIX2LONG(line_val);
    bool has_line = line_number > 0 && (uint64_t) line_number <= UINT32_MAX;

    ddog_crasht_StackFrame_NewResult frame_result = ddog_crasht_StackFrame_new();
    if (frame_result.tag != DDOG_CRASHT_STACK_FRAME_NEW_RESULT_OK) {
      return false;
    }

    ddog_crasht_Handle_StackFrame *frame = &frame_result.ok;

    // Short-circuit chain: stops at the first failing step, in the same order as the individual checks
    bool frame_pushed =
      ddog_crasht_StackFrame_with_file(frame, char_slice_from_ruby_string(file_val)).tag == DDOG_VOID_RESULT_OK &&
      ddog_crasht_StackFrame_with_function(frame, char_slice_from_ruby_string(func_val)).tag == DDOG_VOID_RESULT_OK &&
      (!has_line || ddog_crasht_StackFrame_with_line(frame, (uint32_t) line_number).tag == DDOG_VOID_RESULT_OK) &&
      ddog_crasht_StackTrace_push_frame(stack_trace, frame, true).tag == DDOG_VOID_RESULT_OK;

    if (!frame_pushed) {
      ddog_crasht_StackFrame_drop(frame);
      return false;
    }
  }

  return true;
}
114+
115+
static bool build_and_send_crash_report(ddog_crasht_Metadata metadata,
116+
ddog_Endpoint *endpoint,
117+
VALUE message,
118+
VALUE frames_data) {
119+
ddog_crasht_Handle_StackTrace *stack_trace = NULL;
120+
121+
ddog_crasht_CrashInfoBuilder_NewResult builder_result = ddog_crasht_CrashInfoBuilder_new();
122+
if (builder_result.tag != DDOG_CRASHT_CRASH_INFO_BUILDER_NEW_RESULT_OK) {
123+
return false;
124+
}
125+
126+
ddog_crasht_Handle_CrashInfoBuilder *builder = &builder_result.ok;
127+
128+
// Setup builder metadata and configuration
129+
if (ddog_crasht_CrashInfoBuilder_with_metadata(builder, metadata).tag != DDOG_VOID_RESULT_OK) {
130+
ddog_crasht_CrashInfoBuilder_drop(builder);
131+
return false;
132+
}
133+
134+
if (ddog_crasht_CrashInfoBuilder_with_kind(builder, DDOG_CRASHT_ERROR_KIND_UNHANDLED_EXCEPTION).tag != DDOG_VOID_RESULT_OK) {
135+
ddog_crasht_CrashInfoBuilder_drop(builder);
136+
return false;
137+
}
138+
139+
// Send ping first
140+
if (ddog_crasht_CrashInfoBuilder_upload_ping_to_endpoint(builder, endpoint).tag != DDOG_VOID_RESULT_OK) {
141+
ddog_crasht_CrashInfoBuilder_drop(builder);
142+
return false;
143+
}
144+
145+
ddog_crasht_ProcInfo proc_info = { .pid = (uint32_t)getpid() };
146+
if (ddog_crasht_CrashInfoBuilder_with_proc_info(builder, proc_info).tag != DDOG_VOID_RESULT_OK) {
147+
ddog_crasht_CrashInfoBuilder_drop(builder);
148+
return false;
149+
}
150+
151+
if (ddog_crasht_CrashInfoBuilder_with_os_info_this_machine(builder).tag != DDOG_VOID_RESULT_OK) {
152+
ddog_crasht_CrashInfoBuilder_drop(builder);
153+
return false;
154+
}
155+
156+
if (ddog_crasht_CrashInfoBuilder_with_message(builder, char_slice_from_ruby_string(message)).tag != DDOG_VOID_RESULT_OK) {
157+
ddog_crasht_CrashInfoBuilder_drop(builder);
158+
return false;
159+
}
160+
161+
// Create and populate stack trace
162+
ddog_crasht_StackTrace_NewResult stack_result = ddog_crasht_StackTrace_new();
163+
if (stack_result.tag != DDOG_CRASHT_STACK_TRACE_NEW_RESULT_OK) {
164+
ddog_crasht_CrashInfoBuilder_drop(builder);
165+
return false;
166+
}
167+
168+
stack_trace = &stack_result.ok;
169+
170+
bool frames_processed_successfully = process_crash_frames(frames_data, stack_trace);
171+
172+
// Only mark as complete if we successfully processed all frames
173+
if (frames_processed_successfully) {
174+
if (ddog_crasht_StackTrace_set_complete(stack_trace).tag != DDOG_VOID_RESULT_OK) {
175+
ddog_crasht_StackTrace_drop(stack_trace);
176+
ddog_crasht_CrashInfoBuilder_drop(builder);
177+
return false;
178+
}
179+
}
180+
// If frames processing failed, we still include the stack trace (which may be empty or partial)
181+
// but don't mark it as complete, indicating it's incomplete
182+
183+
if (ddog_crasht_CrashInfoBuilder_with_stack(builder, stack_trace).tag != DDOG_VOID_RESULT_OK) {
184+
ddog_crasht_StackTrace_drop(stack_trace);
185+
ddog_crasht_CrashInfoBuilder_drop(builder);
186+
return false;
187+
}
188+
189+
// Builder takes ownership of stack_trace, so we don't need to clean it up anymore
190+
stack_trace = NULL;
191+
192+
// Build and upload crash info
193+
ddog_crasht_CrashInfo_NewResult crash_info_result = ddog_crasht_CrashInfoBuilder_build(builder);
194+
if (crash_info_result.tag != DDOG_CRASHT_RESULT_HANDLE_CRASH_INFO_OK_HANDLE_CRASH_INFO) {
195+
ddog_crasht_CrashInfoBuilder_drop(builder);
196+
return false;
197+
}
198+
199+
ddog_crasht_Handle_CrashInfo *crash_info = &crash_info_result.ok;
200+
ddog_VoidResult upload_result = ddog_crasht_CrashInfo_upload_to_endpoint(crash_info, endpoint);
201+
bool success = (upload_result.tag == DDOG_VOID_RESULT_OK);
202+
203+
ddog_crasht_CrashInfo_drop(crash_info);
204+
return success;
205+
}

lib/datadog/core.rb

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,14 @@ module Core
2929
# Ensures the Datadog components have a chance to gracefully
3030
# shut down and cleanup before terminating the process.
3131
at_exit do
32-
if Interrupt === $! # rubocop:disable Style/SpecialGlobalVars is process terminating due to a ctrl+c or similar?
32+
exception = $! # rubocop:disable Style/SpecialGlobalVars
33+
34+
if Interrupt === exception # is process terminating due to a ctrl+c or similar?
3335
Datadog.send(:handle_interrupt_shutdown!)
3436
else
37+
# Report unhandled exception to crash tracker before shutdown
38+
Datadog::Core::Crashtracking::Component.report_unhandled_exception(exception)
39+
3540
Datadog.shutdown!
3641
end
3742
end

lib/datadog/core/crashtracking/component.rb

Lines changed: 73 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ module Crashtracking
1717
# Methods prefixed with _native_ are implemented in `crashtracker.c`
1818
class Component
1919
def self.build(settings, agent_settings, logger:)
20-
tags = TagBuilder.call(settings)
20+
tags = latest_tags(settings)
2121
agent_base_url = agent_settings.url
2222

2323
ld_library_path = ::Libdatadog.ld_library_path
@@ -41,6 +41,32 @@ def self.build(settings, agent_settings, logger:)
4141
).tap(&:start)
4242
end
4343

44+
# Forwards an unhandled exception to the crashtracker component, when there is one.
#
# Invoked from the at_exit hook. Does nothing when there is no exception, or when the
# exception is a SystemExit or a NoMemoryError. Components are looked up without
# triggering their initialization.
#
# @param exception [Exception, nil] the exception that is terminating the process, if any
def self.report_unhandled_exception(exception)
  return if !exception || exception.is_a?(SystemExit) || exception.is_a?(NoMemoryError)

  components = Datadog.send(:components, allow_initialization: false)
  crashtracker = components&.crashtracker
  return unless crashtracker

  crashtracker.report_unhandled_exception(exception)
rescue => e
  # Getting here means the application is already in a bad state. Only StandardError is
  # rescued on purpose: anything more severe is left to propagate and crash the application.
  Datadog.logger.debug("Crashtracker failed to report unhandled exception: #{e.message}")
end
61+
62+
# Gets the latest tags from the current configuration.
63+
#
64+
# We always fetch fresh tags because:
65+
# After forking, we need the latest tags, not the parent's tags, such as the pid or runtime-id
66+
def self.latest_tags(settings)
67+
TagBuilder.call(settings)
68+
end
69+
4470
def initialize(tags:, agent_base_url:, ld_library_path:, path_to_crashtracking_receiver_binary:, logger:)
4571
@tags = tags
4672
@agent_base_url = agent_base_url
@@ -54,9 +80,52 @@ def start
5480
end
5581

5682
def update_on_fork(settings: Datadog.configuration)
57-
# Here we pick up the latest settings, so that we pick up any tags that change after forking
58-
# such as the pid or runtime-id
59-
start_or_update_on_fork(action: :update_on_fork, tags: TagBuilder.call(settings))
83+
start_or_update_on_fork(action: :update_on_fork, tags: self.class.latest_tags(settings))
84+
end
85+
86+
# Reports the given exception to the crash tracker through the native extension.
#
# The backtrace is converted to plain [file, function, line] triples, because the native
# side expects exactly three elements per frame with types String, String and Integer,
# and no nils. Missing or invalid values are replaced by placeholders.
#
# @param exception [Exception] the unhandled exception to report
# @param settings the configuration used to build fresh tags (defaults to the current one)
def report_unhandled_exception(exception, settings: Datadog.configuration)
  # Maximum number of stack frames to include in exception crash reports
  # This is the same number used for profiling and signal-based crashtracking
  max_exception_stack_frames = 400

  current_tags = self.class.latest_tags(settings)

  all_backtrace_locations = exception.backtrace_locations || []
  was_truncated = all_backtrace_locations.length > max_exception_stack_frames

  # @type var frames_data: Array[[String, String, Integer]]
  frames_data = all_backtrace_locations.first(max_exception_stack_frames).map do |loc|
    # The type check must come first in each guard: it is what makes the following
    # `empty?` / comparison call safe (it also covers nil).
    file = loc.path
    file = '<unknown>' if !file.is_a?(String) || file.empty?

    function = loc.label
    function = '<unknown>' if !function.is_a?(String) || function.empty?

    line = loc.lineno
    line = 0 if !line.is_a?(Integer) || line < 0

    [file, function, line]
  end

  # Add truncation indicator frame if we had to cut off frames
  if was_truncated
    truncated_count = all_backtrace_locations.length - max_exception_stack_frames
    frames_data << ['<truncated>', "<truncated #{truncated_count} more frames>", 0]
  end

  message = "Unhandled #{exception.class}: #{exception.message || "<no message>"}"

  success = self.class._native_report_ruby_exception(
    agent_base_url,
    message,
    frames_data,
    current_tags.to_a,
    Datadog::VERSION::STRING
  )

  logger.debug('Crashtracker failed to report unhandled exception to crash tracker') unless success
end
61130

62131
def stop

sig/datadog/core/crashtracking/component.rbs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@ module Datadog
88
logger: untyped
99
) -> Datadog::Core::Crashtracking::Component?
1010

11+
def self.report_unhandled_exception: (::Exception?) -> void
12+
13+
def self.latest_tags: (Datadog::Core::Configuration::Settings) -> ::Hash[::String, ::String]
14+
1115
def initialize: (
1216
tags: ::Hash[::String, ::String],
1317
agent_base_url: ::String,
@@ -32,6 +36,8 @@ module Datadog
3236

3337
def start_or_update_on_fork: (action: :start | :update_on_fork, tags: ::Hash[::String, ::String]) -> void
3438

39+
def report_unhandled_exception: (::Exception, ?settings: Datadog::Core::Configuration::Settings) -> void
40+
3541
def self._native_start_or_update_on_fork: (
3642
action: :start | :update_on_fork,
3743
agent_base_url: ::String,
@@ -42,6 +48,14 @@ module Datadog
4248
) -> void
4349

4450
def self._native_stop: -> void
51+
52+
def self._native_report_ruby_exception: (
53+
::String agent_base_url,
54+
::String message,
55+
::Array[[::String, ::String, ::Integer]] frames_data,
56+
::Array[[::String, ::String]] tags_as_array,
57+
::String library_version
58+
) -> bool
4559
end
4660
end
4761
end

0 commit comments

Comments
 (0)