Skip to content

Commit 2dca270

Browse files
committed
Handle OOM and clean exit
1 parent 0882768 commit 2dca270

File tree

11 files changed

+338
-25
lines changed

11 files changed

+338
-25
lines changed

dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/CrashUploader.java

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import static datadog.trace.api.config.CrashTrackingConfig.CRASH_TRACKING_UPLOAD_TIMEOUT;
88
import static datadog.trace.api.config.CrashTrackingConfig.CRASH_TRACKING_UPLOAD_TIMEOUT_DEFAULT;
99
import static datadog.trace.api.telemetry.LogCollector.SEND_TELEMETRY;
10+
import static datadog.trace.util.AgentThreadFactory.AgentThread.CRASHTRACKING_HTTP_DISPATCHER;
1011
import static datadog.trace.util.TraceUtils.normalizeServiceName;
1112
import static datadog.trace.util.TraceUtils.normalizeTagValue;
1213
import static java.time.format.DateTimeFormatter.ISO_OFFSET_DATE_TIME;
@@ -24,6 +25,7 @@
2425
import datadog.trace.api.Config;
2526
import datadog.trace.api.DDTags;
2627
import datadog.trace.bootstrap.config.provider.ConfigProvider;
28+
import datadog.trace.util.AgentThreadFactory;
2729
import datadog.trace.util.PidHelper;
2830
import de.thetaphi.forbiddenapis.SuppressForbidden;
2931
import edu.umd.cs.findbugs.annotations.NonNull;
@@ -37,6 +39,10 @@
3739
import java.util.HashMap;
3840
import java.util.Map;
3941
import java.util.Scanner;
42+
import java.util.concurrent.ExecutorService;
43+
import java.util.concurrent.SynchronousQueue;
44+
import java.util.concurrent.ThreadPoolExecutor;
45+
import java.util.concurrent.TimeUnit;
4046
import java.util.regex.Matcher;
4147
import java.util.regex.Pattern;
4248
import java.util.stream.Collectors;
@@ -50,6 +56,7 @@
5056
import okhttp3.RequestBody;
5157
import okhttp3.Response;
5258
import okio.Buffer;
59+
import okio.ByteString;
5360
import org.slf4j.Logger;
5461
import org.slf4j.LoggerFactory;
5562

@@ -112,6 +119,7 @@ public void onResponse(Call call, Response response) throws IOException {
112119
private final HttpUrl errorTrackingUrl;
113120
private final OkHttpClient uploadClient;
114121
private final Dispatcher dispatcher;
122+
private final ExecutorService executor;
115123
private final boolean agentless;
116124
private final String tags;
117125
private final long timeout;
@@ -127,8 +135,16 @@ public CrashUploader(@Nonnull final ConfigManager.StoredConfig storedConfig) {
127135
this.telemetryUrl = HttpUrl.get(config.getFinalCrashTrackingTelemetryUrl());
128136
this.errorTrackingUrl = HttpUrl.get(config.getFinalCrashTrackingErrorTrackingUrl());
129137
this.agentless = config.isCrashTrackingAgentless();
130-
this.dispatcher = new Dispatcher();
131-
dispatcher.setMaxRequests(4);
138+
// This is the same thing OkHttp Dispatcher is doing except thread naming and daemonization
139+
this.executor =
140+
new ThreadPoolExecutor(
141+
0,
142+
4,
143+
60,
144+
TimeUnit.SECONDS,
145+
new SynchronousQueue<>(),
146+
new AgentThreadFactory(CRASHTRACKING_HTTP_DISPATCHER));
147+
this.dispatcher = new Dispatcher(executor);
132148

133149
final StringBuilder tagsBuilder =
134150
new StringBuilder(storedConfig.tags != null ? storedConfig.tags : "");
@@ -227,7 +243,11 @@ public void upload(@Nonnull Path file) {
227243
} catch (Throwable t) {
228244
log.error("Unable to print the error crash as a log message", t);
229245
}
230-
remoteUpload(fileContent, true, true);
246+
try {
247+
remoteUpload(fileContent, true, true);
248+
} finally {
249+
uploadClient.dispatcher().cancelAll();
250+
}
231251
}
232252

233253
// @VisibleForTesting
@@ -258,6 +278,7 @@ void remoteUpload(
258278
}
259279
if (remaining > 0) {
260280
dispatcher.cancelAll();
281+
uploadClient.connectionPool().evictAll();
261282
log.error(
262283
SEND_TELEMETRY,
263284
"Failed to fully send the crash report with UUID {}. Still {} calls remaining",
@@ -491,21 +512,22 @@ private RequestBody makeErrorTrackingRequestBody(@Nonnull CrashLog payload, bool
491512
writer.beginObject();
492513
writer.name("timestamp").value(payload.timestamp);
493514
writer.name("ddsource").value("crashtracker");
515+
// tags
516+
writer.name("ddtags").value(tagsForErrorTracking(payload.uuid, isPing, payload.incomplete));
494517
// error payload
495518
if (payload.error != null) {
496519
writer.name("error");
497520
writer.beginObject();
498-
writer.name("source_type").value("crashtracking");
499521
if (!isPing) {
500522
writer.name("is_crash").value(true);
501523
}
502524
writer.name("type").value(payload.error.kind);
503525
writer.name("message").value(payload.error.message);
526+
writer.name("source_type").value("crashtracking");
504527
if (payload.error.stack != null) {
505528
writer.name("stack");
506-
// payload.error.message
507-
payload.error.stack.writeAsJson(writer);
508529
// flat write an already serialized json object
530+
payload.error.stack.writeAsJson(writer);
509531
}
510532
writer.endObject();
511533
}
@@ -537,11 +559,11 @@ private RequestBody makeErrorTrackingRequestBody(@Nonnull CrashLog payload, bool
537559
"os.version")); // this has been restructured under OsInfo so taking raw here
538560
writer.endObject();
539561
}
540-
// tags
541-
writer.name("ddtags").value(tagsForErrorTracking(payload.uuid, isPing, payload.incomplete));
542562
writer.endObject();
543563
}
544-
return RequestBody.create(APPLICATION_JSON, buf.readByteString());
564+
final ByteString tmp = buf.readByteString();
565+
System.err.println(tmp.utf8());
566+
return RequestBody.create(APPLICATION_JSON, tmp);
545567
}
546568
}
547569

dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/dto/CrashLog.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ public final class CrashLog {
3636
@Json(name = "version_id")
3737
public final int version = VERSION;
3838

39-
// not serialized
40-
public final transient SigInfo sigInfo;
39+
@Json(name = "sig_info")
40+
public final SigInfo sigInfo;
4141

4242
public CrashLog(
4343
String uuid,
@@ -118,6 +118,7 @@ public boolean equalsForTest(Object o) {
118118
&& Objects.equals(timestamp, crashLog.timestamp)
119119
&& Objects.equals(error, crashLog.error)
120120
&& Objects.equals(procInfo, crashLog.procInfo)
121+
&& Objects.equals(sigInfo, crashLog.sigInfo)
121122
&& Objects.equals(dataSchemaVersion, crashLog.dataSchemaVersion);
122123
}
123124
}

dd-java-agent/agent-crashtracking/src/main/java/datadog/crashtracking/parsers/HotspotCrashLogParser.java

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ public final class HotspotCrashLogParser {
2626
DateTimeFormatter.ofPattern("EEE MMM ppd HH:mm:ss yyyy zzz", Locale.getDefault());
2727
private static final DateTimeFormatter OFFSET_DATE_TIME_FORMATTER =
2828
DateTimeFormatter.ofPattern("EEE MMM ppd HH:mm:ss yyyy X", Locale.getDefault());
29+
private static final String OOM_MARKER = "OutOfMemory encountered: ";
2930

3031
enum State {
3132
NEW,
@@ -122,6 +123,7 @@ public CrashLog parse(String uuid, String crashLog) {
122123
List<StackFrame> frames = new ArrayList<>();
123124
String datetime = null;
124125
boolean incomplete = false;
126+
String oomMessage = null;
125127

126128
String[] lines = NEWLINE_SPLITTER.split(crashLog);
127129
outer:
@@ -137,8 +139,11 @@ public CrashLog parse(String uuid, String crashLog) {
137139
if (line.toLowerCase().contains("core dump")) {
138140
// break out of the message block
139141
state = State.HEADER;
140-
} else if (!"#".equals(line)) {
141-
if (sigInfo == null) {
142+
} else if (!"#".equals(line) && (sigInfo == null && oomMessage == null)) {
143+
final int oomIdx = line.indexOf(OOM_MARKER);
144+
if (oomIdx > 0) {
145+
oomMessage = line.substring(oomIdx + OOM_MARKER.length());
146+
} else {
142147
String name = null, address = null;
143148
int number = 0;
144149
// first non-empty line after the message is the signal
@@ -157,7 +162,9 @@ public CrashLog parse(String uuid, String crashLog) {
157162
int endIdx = line.indexOf(',', pcIdx);
158163
address = line.substring(pcIdx + 3, endIdx);
159164
}
160-
sigInfo = new SigInfo(number, name, address);
165+
if (name != null) {
166+
sigInfo = new SigInfo(number, name, address);
167+
}
161168

162169
int pidIdx = line.indexOf("pid=");
163170
if (pidIdx > -1) {
@@ -193,7 +200,10 @@ public CrashLog parse(String uuid, String crashLog) {
193200
state = State.DONE;
194201
} else {
195202
// Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
196-
frames.add(parseLine(line));
203+
final StackFrame frame = parseLine(line);
204+
if (frame != null) {
205+
frames.add(frame);
206+
}
197207
}
198208
break;
199209
case DONE:
@@ -209,13 +219,18 @@ public CrashLog parse(String uuid, String crashLog) {
209219
// incomplete crash log
210220
incomplete = true;
211221
}
212-
String message = "Process terminated by signal " + (sigInfo != null ? sigInfo.name : "UNKNOWN");
222+
final String kind;
223+
final String message;
224+
if (oomMessage != null) {
225+
kind = "OutOfMemory";
226+
message = oomMessage;
227+
} else {
228+
kind = sigInfo != null && sigInfo.name != null ? sigInfo.name : "UNKNOWN";
229+
message = "Process terminated by signal " + kind;
230+
}
213231

214232
ErrorData error =
215-
new ErrorData(
216-
sigInfo != null ? sigInfo.name : null,
217-
message,
218-
new StackTrace(frames.toArray(new StackFrame[0])));
233+
new ErrorData(kind, message, new StackTrace(frames.toArray(new StackFrame[0])));
219234
// We cannot really extract the full metadata and OS info from the crash log
220235
// This code assumes the parser is run on the same machine as the crash happened
221236
Metadata metadata = new Metadata("dd-trace-java", VersionInfo.VERSION, "java", null);

dd-java-agent/agent-crashtracking/src/test/java/datadog/crashtracking/CrashUploaderTest.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,8 @@ public void testErrorTrackingCrashPing() throws Exception {
242242
strings = {
243243
"sample-crash-for-telemetry.txt",
244244
"sample-crash-for-telemetry-2.txt",
245-
"sample-crash-for-telemetry-3.txt"
245+
"sample-crash-for-telemetry-3.txt",
246+
"sample_oom.txt"
246247
})
247248
public void testTelemetryHappyPath(String log) throws Exception {
248249
// Given
@@ -289,7 +290,8 @@ public void testTelemetryHappyPath(String log) throws Exception {
289290
strings = {
290291
"sample-crash-for-telemetry.txt",
291292
"sample-crash-for-telemetry-2.txt",
292-
"sample-crash-for-telemetry-3.txt"
293+
"sample-crash-for-telemetry-3.txt",
294+
"sample_oom.txt"
293295
})
294296
public void testErrorTrackingHappyPath(String log) throws Exception {
295297
// Given
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"timestamp":"2025-11-24T09:43:29+01:00","ddsource":"crashtracker","error":{"is_crash":true,"type":"OutOfMemory","message":"Java heap space","source_type":"crashtracking","stack":{"format":"CrashTrackerV1","frames":[{"file":"libjvm.dylib","function":"VMError::report_and_die(int, char const*, char const*, char*, Thread*, unsigned char*, void*, void*, char const*, int, unsigned long)"},{"file":"libjvm.dylib","function":"report_fatal(VMErrorType, char const*, int, char const*, ...)"},{"file":"libjvm.dylib","function":"report_java_out_of_memory(char const*)"},{"file":"libjvm.dylib","function":"MemAllocator::Allocation::check_out_of_memory()"},{"file":"libjvm.dylib","function":"MemAllocator::allocate() const"},{"file":"libjvm.dylib","function":"CollectedHeap::array_allocate(Klass*, unsigned long, int, bool, JavaThread*)"},{"file":"libjvm.dylib","function":"OptoRuntime::new_array_C(Klass*, int, JavaThread*)"},{"function":"Java"},{"function":" ~RuntimeStub::_new_array_Java 0x00000001124cb638"},{"function":"java.nio.ByteBuffer.allocate(I)Ljava/nio/ByteBuffer;"},{"function":"datadog.communication.serialization.FlushingBuffer.<init>(ILdatadog/communication/serialization/ByteBufferConsumer;)V","line":6},{"function":"datadog.trace.agent.common.writer.PayloadDispatcherImpl.selectMapper()V","line":126},{"function":"datadog.trace.agent.common.writer.PayloadDispatcherImpl.addTrace(Ljava/util/List;)V","line":1},{"function":"datadog.trace.agent.common.writer.TraceProcessingWorker$TraceSerializingHandler.onEvent(Ljava/lang/Object;)V","line":22},{"function":"datadog.trace.agent.common.writer.TraceProcessingWorker$TraceSerializingHandler.consumeFromPrimaryQueue()V","line":21},{"function":"datadog.trace.agent.common.writer.TraceProcessingWorker$TraceSerializingHandler.runDutyCycle()V","line":12},{"function":"datadog.trace.agent.common.writer.TraceProcessingWorker$TraceSerializingHandler.run()V","line":1},{"function":"java.lang.Thread.runWith(Ljava/lang/Object;Ljava/lang/Runnable;)V"},{"function":"java.lang.Thread.run()V"},{"function":" ~StubRoutines::call_stub 0x00000001123b0140"}]}}}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"data_schema_version":"1.0","error":{"is_crash":true,"kind":"SIGSEGV","message":"Process terminated by signal SIGSEGV","source_type":"crashtracking","stack":{"format":"CrashTrackerV1","frames":[{"file":"libpthread.so.0","function":"__pthread_clockjoin_ex"}]}},"incomplete":false,"metadata":{"family":"java","library_name":"dd-trace-java","library_version":"1.56.0-SNAPSHOT~37a6360670"},"os_info":{"architecture":"aarch64","bitness":"64","os_type":"Mac OS X","version":{"Semantic":[15,7,1]}},"proc_info":{"pid":"576034"},"timestamp":"2024-09-20T13:19:06Z","uuid":"a4194cd6-8cb3-45fd-9bd9-3af83e0a3ad3","version_id":0}
1+
{"data_schema_version":"1.0","error":{"is_crash":true,"kind":"SIGSEGV","message":"Process terminated by signal SIGSEGV","source_type":"crashtracking","stack":{"format":"CrashTrackerV1","frames":[{"file":"libpthread.so.0","function":"__pthread_clockjoin_ex"}]}},"incomplete":false,"metadata":{"family":"java","library_name":"dd-trace-java","library_version":"1.57.0-SNAPSHOT~0882768757"},"os_info":{"architecture":"aarch64","bitness":"64","os_type":"Mac OS X","version":{"Semantic":[15,7,1]}},"proc_info":{"pid":"576034"},"sig_info":{"address":"0x00007f011ab1ccd5 (sent by kill)","name":"SIGSEGV","number":11},"timestamp":"2024-09-20T13:19:06Z","uuid":"a4194cd6-8cb3-45fd-9bd9-3af83e0a3ad3","version_id":0}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"data_schema_version":"1.0","error":{"is_crash":true,"kind":"INVALID","message":"Process terminated by signal INVALID","source_type":"crashtracking","stack":{"format":"CrashTrackerV1","frames":[{"file":"libjvm.dylib","function":"VMError::report_and_die()"},{"file":"libjvm.dylib","function":"report_vm_error(char const*, int, char const*, char const*)"},{"file":"libjvm.dylib","function":"report_java_out_of_memory(char const*)"},{"file":"libjvm.dylib","function":"CollectedHeap::common_mem_allocate_noinit(KlassHandle, unsigned long, Thread*)"},{"file":"libjvm.dylib","function":"TypeArrayKlass::allocate_common(int, bool, Thread*)"},{"file":"libjvm.dylib","function":"InterpreterRuntime::newarray(JavaThread*, BasicType, int)"},{"function":"datadog.smoketest.crashtracking.CrashtrackingTestApplication.main([Ljava/lang/String;)V","line":105},{"function":" ~StubRoutines::call_stub"},{"file":"libjvm.dylib","function":"JavaCalls::call_helper(JavaValue*, methodHandle*, JavaCallArguments*, Thread*)"},{"file":"libjvm.dylib","function":"jni_invoke_static(JNIEnv_*, JavaValue*, _jobject*, JNICallType, _jmethodID*, JNI_ArgumentPusher*, Thread*)"},{"file":"libjvm.dylib","function":"jni_CallStaticVoidMethod"},{"file":"java","function":"JavaMain"},{"file":"libsystem_pthread.dylib","function":"_pthread_start"},{"file":"libsystem_pthread.dylib","function":"thread_start"}]}},"incomplete":false,"metadata":{"family":"java","library_name":"dd-trace-java","library_version":"1.56.0-SNAPSHOT~37a6360670"},"os_info":{"architecture":"aarch64","bitness":"64","os_type":"Mac OS X","version":{"Semantic":[15,7,1]}},"proc_info":{"pid":"96267"},"uuid":"a4194cd6-8cb3-45fd-9bd9-3af83e0a3ad3","version_id":0}
1+
{"data_schema_version":"1.0","error":{"is_crash":true,"kind":"INVALID","message":"Process terminated by signal INVALID","source_type":"crashtracking","stack":{"format":"CrashTrackerV1","frames":[{"file":"libjvm.dylib","function":"VMError::report_and_die()"},{"file":"libjvm.dylib","function":"report_vm_error(char const*, int, char const*, char const*)"},{"file":"libjvm.dylib","function":"report_java_out_of_memory(char const*)"},{"file":"libjvm.dylib","function":"CollectedHeap::common_mem_allocate_noinit(KlassHandle, unsigned long, Thread*)"},{"file":"libjvm.dylib","function":"TypeArrayKlass::allocate_common(int, bool, Thread*)"},{"file":"libjvm.dylib","function":"InterpreterRuntime::newarray(JavaThread*, BasicType, int)"},{"function":"datadog.smoketest.crashtracking.CrashtrackingTestApplication.main([Ljava/lang/String;)V","line":105},{"function":" ~StubRoutines::call_stub"},{"file":"libjvm.dylib","function":"JavaCalls::call_helper(JavaValue*, methodHandle*, JavaCallArguments*, Thread*)"},{"file":"libjvm.dylib","function":"jni_invoke_static(JNIEnv_*, JavaValue*, _jobject*, JNICallType, _jmethodID*, JNI_ArgumentPusher*, Thread*)"},{"file":"libjvm.dylib","function":"jni_CallStaticVoidMethod"},{"file":"java","function":"JavaMain"},{"file":"libsystem_pthread.dylib","function":"_pthread_start"},{"file":"libsystem_pthread.dylib","function":"thread_start"}]}},"incomplete":false,"metadata":{"family":"java","library_name":"dd-trace-java","library_version":"1.57.0-SNAPSHOT~0882768757"},"os_info":{"architecture":"aarch64","bitness":"64","os_type":"Mac OS X","version":{"Semantic":[15,7,1]}},"proc_info":{"pid":"96267"},"sig_info":{"address":"0x0000000000000000","name":"INVALID","number":0},"uuid":"a4194cd6-8cb3-45fd-9bd9-3af83e0a3ad3","version_id":0}

0 commit comments

Comments
 (0)