Skip to content

Commit 9361d1d

Browse files
heyamstrask
andauthored
Turn off Statsbeat when proxy is used or any exception from the server (#2221)
* Turn off statsbeat when proxy used or any exception from the server * Fix comment format * Add a todo * Address feedback * Change todo * Shut down after 3 statsbeat errors * Reset statsbeat error count on success * Use one time success flag and increment max to 10 * No need to reset Co-authored-by: Trask Stalnaker <[email protected]> * Suppress statbeat error logging * Simplify and avoid other unknown cases * move and comment * one more logging * Update to latest spec PR * imports * Add comment * Add TODO Co-authored-by: Trask Stalnaker <[email protected]>
1 parent 1b587fe commit 9361d1d

File tree

2 files changed

+80
-26
lines changed

2 files changed

+80
-26
lines changed

agent/agent-tooling/src/main/java/com/microsoft/applicationinsights/agent/internal/common/OperationLogger.java

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,28 +28,40 @@
2828
// e.g. sending telemetry to the portal, storing telemetry to disk, ...
2929
public class OperationLogger {
3030

31-
private final AggregatingLogger aggregatingLogger;
31+
public static final OperationLogger NOOP = new OperationLogger(null);
32+
33+
@Nullable private final AggregatingLogger aggregatingLogger;
3234

3335
public OperationLogger(Class<?> source, String operation) {
3436
this(source, operation, 300);
3537
}
3638

3739
// visible for testing
3840
OperationLogger(Class<?> source, String operation, int intervalSeconds) {
39-
aggregatingLogger = new AggregatingLogger(source, operation, true, intervalSeconds);
41+
this(new AggregatingLogger(source, operation, true, intervalSeconds));
42+
}
43+
44+
private OperationLogger(@Nullable AggregatingLogger aggregatingLogger) {
45+
this.aggregatingLogger = aggregatingLogger;
4046
}
4147

4248
public void recordSuccess() {
43-
aggregatingLogger.recordSuccess();
49+
if (aggregatingLogger != null) {
50+
aggregatingLogger.recordSuccess();
51+
}
4452
}
4553

4654
// failureMessage should have low cardinality
4755
public void recordFailure(String failureMessage) {
48-
aggregatingLogger.recordWarning(failureMessage);
56+
if (aggregatingLogger != null) {
57+
aggregatingLogger.recordWarning(failureMessage);
58+
}
4959
}
5060

5161
// failureMessage should have low cardinality
5262
public void recordFailure(String failureMessage, @Nullable Throwable exception) {
53-
aggregatingLogger.recordWarning(failureMessage, exception);
63+
if (aggregatingLogger != null) {
64+
aggregatingLogger.recordWarning(failureMessage, exception);
65+
}
5466
}
5567
}

agent/agent-tooling/src/main/java/com/microsoft/applicationinsights/agent/internal/telemetry/TelemetryChannel.java

Lines changed: 63 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
package com.microsoft.applicationinsights.agent.internal.telemetry;
2323

24+
import static java.util.Arrays.asList;
2425
import static java.util.Collections.singletonList;
2526

2627
import com.azure.core.http.HttpMethod;
@@ -49,14 +50,15 @@
4950
import java.io.IOException;
5051
import java.io.StringWriter;
5152
import java.net.URL;
52-
import java.net.UnknownHostException;
5353
import java.nio.ByteBuffer;
54-
import java.nio.channels.UnresolvedAddressException;
5554
import java.util.ArrayList;
5655
import java.util.HashMap;
56+
import java.util.HashSet;
5757
import java.util.List;
5858
import java.util.Map;
59+
import java.util.Set;
5960
import java.util.concurrent.atomic.AtomicBoolean;
61+
import java.util.concurrent.atomic.AtomicInteger;
6062
import java.util.function.Consumer;
6163
import java.util.zip.GZIPOutputStream;
6264
import javax.annotation.Nullable;
@@ -74,18 +76,19 @@ public class TelemetryChannel {
7476

7577
private static final AppInsightsByteBufferPool byteBufferPool = new AppInsightsByteBufferPool();
7678

77-
// TODO (heya) should we suppress logging statsbeat telemetry ingestion issues?
78-
private static final OperationLogger operationLogger =
79-
new OperationLogger(TelemetryChannel.class, "Sending telemetry to the ingestion service");
80-
81-
private static final OperationLogger retryOperationLogger =
82-
new OperationLogger(
83-
TelemetryChannel.class, "Sending telemetry to the ingestion service (retry)");
79+
private final OperationLogger operationLogger;
80+
private final OperationLogger retryOperationLogger;
8481

8582
// TODO (kryalama) do we still need this AtomicBoolean, or can we use throttling built in to the
8683
// operationLogger?
8784
private final AtomicBoolean friendlyExceptionThrown = new AtomicBoolean();
8885

86+
private final AtomicInteger statsbeatUnableToReachBreezeCounter = new AtomicInteger();
87+
// TODO (trask) remove this boolean and shutdown the disk loader for statsbeat instead
88+
private final AtomicBoolean statsbeatHasBeenShutdown = new AtomicBoolean();
89+
90+
private volatile boolean statsbeatHasReachedBreezeAtLeastOnce;
91+
8992
@SuppressWarnings("CatchAndPrintStackTrace")
9093
private static ObjectMapper createObjectMapper() {
9194
ObjectMapper mapper = new ObjectMapper();
@@ -121,6 +124,10 @@ public CompletableResultCode sendRawBytes(
121124
String instrumentationKey,
122125
Runnable onSuccess,
123126
Consumer<Boolean> onFailure) {
127+
if (isStatsbeat && statsbeatHasBeenShutdown.get()) {
128+
// let it be deleted from disk so that it won't keep getting retried
129+
return CompletableResultCode.ofSuccess();
130+
}
124131
return internalSend(
125132
singletonList(buffer), instrumentationKey, onSuccess, onFailure, retryOperationLogger);
126133
}
@@ -137,6 +144,18 @@ public TelemetryChannel(
137144
this.localFileWriter = localFileWriter;
138145
this.statsbeatModule = statsbeatModule;
139146
this.isStatsbeat = isStatsbeat;
147+
148+
if (isStatsbeat) {
149+
// suppress all logging for statsbeat telemetry failures
150+
operationLogger = OperationLogger.NOOP;
151+
retryOperationLogger = OperationLogger.NOOP;
152+
} else {
153+
operationLogger =
154+
new OperationLogger(TelemetryChannel.class, "Sending telemetry to the ingestion service");
155+
retryOperationLogger =
156+
new OperationLogger(
157+
TelemetryChannel.class, "Sending telemetry to the ingestion service (retry)");
158+
}
140159
}
141160

142161
public CompletableResultCode send(List<TelemetryItem> telemetryItems) {
@@ -184,7 +203,7 @@ public CompletableResultCode internalSendByInstrumentationKey(
184203

185204
List<ByteBuffer> encode(List<TelemetryItem> telemetryItems) throws IOException {
186205

187-
if (logger.isDebugEnabled()) {
206+
if (!isStatsbeat && logger.isDebugEnabled()) {
188207
StringWriter debug = new StringWriter();
189208
try (JsonGenerator jg = mapper.createGenerator(debug)) {
190209
writeTelemetryItems(jg, telemetryItems);
@@ -282,6 +301,13 @@ private CompletableResultCode internalSend(
282301
return result;
283302
}
284303

304+
// not including 401/403/503 in this list because those are commonly returned by proxy servers
305+
// when they are not configured to allow traffic for westus-0
306+
// not including 307/308 in this list because redirects only bubble up to this class if they have
307+
// reached the 10 redirect threshold, in which case they are considered non-retryable exceptions
308+
private static final Set<Integer> RESPONSE_CODES_INDICATING_REACHED_BREEZE =
309+
new HashSet<>(asList(200, 206, 402, 408, 429, 439, 500));
310+
285311
private Consumer<HttpResponse> responseHandler(
286312
String instrumentationKey,
287313
long startTime,
@@ -296,6 +322,13 @@ private Consumer<HttpResponse> responseHandler(
296322
.subscribe(
297323
body -> {
298324
int statusCode = response.getStatusCode();
325+
if (isStatsbeat && !statsbeatHasReachedBreezeAtLeastOnce) {
326+
if (RESPONSE_CODES_INDICATING_REACHED_BREEZE.contains(statusCode)) {
327+
statsbeatHasReachedBreezeAtLeastOnce = true;
328+
} else {
329+
statsbeatDidNotReachBreeze();
330+
}
331+
}
299332
switch (statusCode) {
300333
case 200: // SUCCESS
301334
operationLogger.recordSuccess();
@@ -364,19 +397,13 @@ private Consumer<Throwable> errorHandler(
364397
String instrumentationKey, Consumer<Boolean> onFailure, OperationLogger operationLogger) {
365398

366399
return error -> {
367-
if (isStatsbeat
368-
&& (error instanceof UnknownHostException
369-
|| error instanceof UnresolvedAddressException)) {
370-
// when sending a Statsbeat request and server returns an UnknownHostException, it's
371-
// likely that it's using AMPLS. In that case, we use the kill-switch to turn off Statsbeat.
372-
statsbeatModule.shutdown();
373-
onFailure.accept(false);
374-
return;
400+
if (isStatsbeat && !statsbeatHasReachedBreezeAtLeastOnce) {
401+
statsbeatDidNotReachBreeze();
375402
}
376403

377-
// TODO (trask) only log one-time friendly exception if no prior successes
378-
if (!NetworkFriendlyExceptions.logSpecialOneTimeFriendlyException(
379-
error, endpointUrl.toString(), friendlyExceptionThrown, logger)) {
404+
if (!isStatsbeat
405+
&& !NetworkFriendlyExceptions.logSpecialOneTimeFriendlyException(
406+
error, endpointUrl.toString(), friendlyExceptionThrown, logger)) {
380407
operationLogger.recordFailure(
381408
"Error sending telemetry items: " + error.getMessage(), error);
382409
}
@@ -389,6 +416,21 @@ private Consumer<Throwable> errorHandler(
389416
};
390417
}
391418

419+
private void statsbeatDidNotReachBreeze() {
420+
if (statsbeatUnableToReachBreezeCounter.getAndIncrement() >= 10
421+
&& !statsbeatHasBeenShutdown.getAndSet(true)) {
422+
// shutting down statsbeat because it's unlikely that it will ever get through at this point
423+
// some possible reasons:
424+
// * AMPLS
425+
// * proxy that has not been configured to allow westus-0
426+
// * local firewall that has not been configured to allow westus-0
427+
//
428+
// TODO need to figure out a way that statsbeat telemetry can be sent to the same endpoint as
429+
// the customer data for these cases
430+
statsbeatModule.shutdown();
431+
}
432+
}
433+
392434
private static String getErrorMessageFromPartialSuccessResponse(String body) {
393435
JsonNode jsonNode;
394436
try {

0 commit comments

Comments
 (0)