Skip to content

Commit 4a6e16c

Browse files
authored
Make CBE message creation more robust (#87687) (#87874)
Child circuit breakers rely on proper matching of acquire/release pairs. This can be tricky to get right. If we get it wrong and accidentally double-release a CB then it may end up with a negative `used` value. This is definitely a bad situation in which to find ourselves, but today in production it's made a whole lot worse because it causes exceptions on every attempt to report a `CircuitBreakerStats` or to construct a parent `CircuitBreakingException`. This commit makes the message construction and stats serialization a little more robust so that it's clearer what is going on in production. Relates #86059.
1 parent d38b959 commit 4a6e16c

File tree

5 files changed

+228
-55
lines changed

5 files changed

+228
-55
lines changed

docs/changelog/87687.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 87687
2+
summary: Make CBE message creation more robust
3+
area: Infra/Circuit Breakers
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/indices/breaker/CircuitBreakerStats.java

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -77,33 +77,43 @@ public double getOverhead() {
7777
@Override
7878
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
7979
builder.startObject(name.toLowerCase(Locale.ROOT));
80-
builder.field(Fields.LIMIT, limit);
81-
builder.field(Fields.LIMIT_HUMAN, new ByteSizeValue(limit));
82-
builder.field(Fields.ESTIMATED, estimated);
83-
builder.field(Fields.ESTIMATED_HUMAN, new ByteSizeValue(estimated));
80+
addBytesFieldsSafe(builder, limit, Fields.LIMIT, Fields.LIMIT_HUMAN);
81+
addBytesFieldsSafe(builder, estimated, Fields.ESTIMATED, Fields.ESTIMATED_HUMAN);
8482
builder.field(Fields.OVERHEAD, overhead);
8583
builder.field(Fields.TRIPPED_COUNT, trippedCount);
8684
builder.endObject();
8785
return builder;
8886
}
8987

88+
private void addBytesFieldsSafe(XContentBuilder builder, long bytes, String rawFieldName, String humanFieldName) throws IOException {
89+
builder.field(rawFieldName, bytes);
90+
if (0 <= bytes) {
91+
builder.field(humanFieldName, new ByteSizeValue(bytes));
92+
} else {
93+
// Something's definitely wrong, maybe a breaker was freed twice? Still, we're just writing out stats here, so we should keep
94+
// going if we're running in production.
95+
assert HierarchyCircuitBreakerService.permitNegativeValues : this;
96+
// noinspection ResultOfMethodCallIgnored - we call toString() to log a warning
97+
toString();
98+
builder.field(humanFieldName, "");
99+
}
100+
}
101+
90102
@Override
91103
public String toString() {
92-
return "["
93-
+ this.name
94-
+ ",limit="
95-
+ this.limit
96-
+ "/"
97-
+ new ByteSizeValue(this.limit)
98-
+ ",estimated="
99-
+ this.estimated
100-
+ "/"
101-
+ new ByteSizeValue(this.estimated)
102-
+ ",overhead="
103-
+ this.overhead
104-
+ ",tripped="
105-
+ this.trippedCount
106-
+ "]";
104+
final var stringBuilder = new StringBuilder();
105+
stringBuilder.append("[");
106+
stringBuilder.append(this.name);
107+
stringBuilder.append(",limit=");
108+
HierarchyCircuitBreakerService.appendBytesSafe(stringBuilder, this.limit);
109+
stringBuilder.append(",estimated=");
110+
HierarchyCircuitBreakerService.appendBytesSafe(stringBuilder, this.estimated);
111+
stringBuilder.append(",overhead=");
112+
stringBuilder.append(this.overhead);
113+
stringBuilder.append(",tripped=");
114+
stringBuilder.append(this.trippedCount);
115+
stringBuilder.append("]");
116+
return stringBuilder.toString();
107117
}
108118

109119
static final class Fields {

server/src/main/java/org/elasticsearch/indices/breaker/HierarchyCircuitBreakerService.java

Lines changed: 70 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@
3535
import java.util.Map;
3636
import java.util.concurrent.atomic.AtomicLong;
3737
import java.util.concurrent.locks.ReentrantLock;
38+
import java.util.function.BiConsumer;
3839
import java.util.function.Function;
3940
import java.util.function.LongSupplier;
40-
import java.util.stream.Collectors;
4141

4242
import static org.elasticsearch.core.Strings.format;
4343
import static org.elasticsearch.indices.breaker.BreakerSettings.CIRCUIT_BREAKER_LIMIT_SETTING;
@@ -397,47 +397,78 @@ public void checkParentLimit(long newBytesReserved, String label) throws Circuit
397397
long parentLimit = this.parentSettings.getLimit();
398398
if (memoryUsed.totalUsage > parentLimit && overLimitStrategy.overLimit(memoryUsed).totalUsage > parentLimit) {
399399
this.parentTripCount.incrementAndGet();
400-
final StringBuilder message = new StringBuilder(
401-
"[parent] Data too large, data for ["
402-
+ label
403-
+ "]"
404-
+ " would be ["
405-
+ memoryUsed.totalUsage
406-
+ "/"
407-
+ new ByteSizeValue(memoryUsed.totalUsage)
408-
+ "]"
409-
+ ", which is larger than the limit of ["
410-
+ parentLimit
411-
+ "/"
412-
+ new ByteSizeValue(parentLimit)
413-
+ "]"
400+
final String messageString = buildParentTripMessage(
401+
newBytesReserved,
402+
label,
403+
memoryUsed,
404+
parentLimit,
405+
this.trackRealMemoryUsage,
406+
this.breakers
414407
);
415-
if (this.trackRealMemoryUsage) {
416-
final long realUsage = memoryUsed.baseUsage;
417-
message.append(", real usage: [");
418-
message.append(realUsage);
419-
message.append("/");
420-
message.append(new ByteSizeValue(realUsage));
421-
message.append("], new bytes reserved: [");
422-
message.append(newBytesReserved);
423-
message.append("/");
424-
message.append(new ByteSizeValue(newBytesReserved));
425-
message.append("]");
426-
}
427-
message.append(", usages [");
428-
message.append(this.breakers.entrySet().stream().map(e -> {
429-
final CircuitBreaker breaker = e.getValue();
430-
final long breakerUsed = (long) (breaker.getUsed() * breaker.getOverhead());
431-
return e.getKey() + "=" + breakerUsed + "/" + new ByteSizeValue(breakerUsed);
432-
}).collect(Collectors.joining(", ")));
433-
message.append("]");
434408
// derive durability of a tripped parent breaker depending on whether the majority of memory tracked by
435409
// child circuit breakers is categorized as transient or permanent.
436410
CircuitBreaker.Durability durability = memoryUsed.transientChildUsage >= memoryUsed.permanentChildUsage
437411
? CircuitBreaker.Durability.TRANSIENT
438412
: CircuitBreaker.Durability.PERMANENT;
439-
logger.debug(() -> format("%s", message.toString()));
440-
throw new CircuitBreakingException(message.toString(), memoryUsed.totalUsage, parentLimit, durability);
413+
logger.debug(() -> format("%s", messageString));
414+
throw new CircuitBreakingException(messageString, memoryUsed.totalUsage, parentLimit, durability);
415+
}
416+
}
417+
418+
// exposed for tests
419+
static String buildParentTripMessage(
420+
long newBytesReserved,
421+
String label,
422+
MemoryUsage memoryUsed,
423+
long parentLimit,
424+
boolean trackRealMemoryUsage,
425+
Map<String, CircuitBreaker> breakers
426+
) {
427+
final var message = new StringBuilder();
428+
message.append("[parent] Data too large, data for [");
429+
message.append(label);
430+
message.append("] would be [");
431+
appendBytesSafe(message, memoryUsed.totalUsage);
432+
message.append("], which is larger than the limit of [");
433+
appendBytesSafe(message, parentLimit);
434+
message.append("]");
435+
if (trackRealMemoryUsage) {
436+
final long realUsage = memoryUsed.baseUsage;
437+
message.append(", real usage: [");
438+
appendBytesSafe(message, realUsage);
439+
message.append("], new bytes reserved: [");
440+
appendBytesSafe(message, newBytesReserved);
441+
message.append("]");
442+
}
443+
message.append(", usages [");
444+
breakers.forEach(new BiConsumer<>() {
445+
private boolean first = true;
446+
447+
@Override
448+
public void accept(String key, CircuitBreaker breaker) {
449+
if (first) {
450+
first = false;
451+
} else {
452+
message.append(", ");
453+
}
454+
message.append(key).append("=");
455+
appendBytesSafe(message, (long) (breaker.getUsed() * breaker.getOverhead()));
456+
}
457+
});
458+
message.append("]");
459+
return message.toString();
460+
}
461+
462+
static void appendBytesSafe(StringBuilder stringBuilder, long bytes) {
463+
stringBuilder.append(bytes);
464+
if (bytes >= 0) {
465+
stringBuilder.append("/");
466+
stringBuilder.append(new ByteSizeValue(bytes));
467+
} else {
468+
// Something's definitely wrong, maybe a breaker was freed twice? Still, we're just creating an exception message here, so we
469+
// should keep going if we're running in production.
470+
logger.error("negative value in circuit breaker: {}", stringBuilder);
471+
assert permitNegativeValues : stringBuilder.toString();
441472
}
442473
}
443474

@@ -636,4 +667,7 @@ TimeValue getLockTimeout() {
636667
return lockTimeout;
637668
}
638669
}
670+
671+
// exposed for testing
672+
static boolean permitNegativeValues = false;
639673
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0 and the Server Side Public License, v 1; you may not use this file except
5+
* in compliance with, at your election, the Elastic License 2.0 or the Server
6+
* Side Public License, v 1.
7+
*/
8+
9+
package org.elasticsearch.indices.breaker;
10+
11+
import org.elasticsearch.common.Strings;
12+
import org.elasticsearch.test.ESTestCase;
13+
import org.elasticsearch.xcontent.ToXContentObject;
14+
15+
import static org.hamcrest.Matchers.equalTo;
16+
17+
public class CircuitBreakerStatsTests extends ESTestCase {
18+
19+
public void testStringRepresentations() {
20+
final var circuitBreakerStats = new CircuitBreakerStats("t", 1L, 2L, 1.0, 3L);
21+
assertThat(circuitBreakerStats.toString(), equalTo("[t,limit=1/1b,estimated=2/2b,overhead=1.0,tripped=3]"));
22+
assertThat(toJson(circuitBreakerStats), equalTo("""
23+
{"t":{"limit_size_in_bytes":1,"limit_size":"1b","estimated_size_in_bytes":2,"estimated_size":"2b","overhead":1.0,"tripped":3}}\
24+
"""));
25+
}
26+
27+
public void testStringRepresentationsWithNegativeStats() {
28+
try {
29+
HierarchyCircuitBreakerService.permitNegativeValues = true;
30+
final var circuitBreakerStats = new CircuitBreakerStats("t", -1L, -2L, 1.0, 3L);
31+
assertThat(circuitBreakerStats.toString(), equalTo("[t,limit=-1,estimated=-2,overhead=1.0,tripped=3]"));
32+
assertThat(toJson(circuitBreakerStats), equalTo("""
33+
{"t":{"limit_size_in_bytes":-1,"limit_size":"","estimated_size_in_bytes":-2,"estimated_size":"",\
34+
"overhead":1.0,"tripped":3}}"""));
35+
} finally {
36+
HierarchyCircuitBreakerService.permitNegativeValues = false;
37+
}
38+
}
39+
40+
private static String toJson(CircuitBreakerStats circuitBreakerStats) {
41+
return Strings.toString((ToXContentObject) (builder, params) -> {
42+
builder.startObject();
43+
circuitBreakerStats.toXContent(builder, params);
44+
builder.endObject();
45+
return builder;
46+
}, false, true);
47+
}
48+
}

server/src/test/java/org/elasticsearch/indices/breaker/HierarchyCircuitBreakerServiceTests.java

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import org.elasticsearch.common.breaker.ChildMemoryCircuitBreaker;
1212
import org.elasticsearch.common.breaker.CircuitBreaker;
1313
import org.elasticsearch.common.breaker.CircuitBreakingException;
14+
import org.elasticsearch.common.breaker.NoopCircuitBreaker;
1415
import org.elasticsearch.common.settings.ClusterSettings;
1516
import org.elasticsearch.common.settings.Settings;
1617
import org.elasticsearch.common.unit.ByteSizeUnit;
@@ -25,6 +26,7 @@
2526
import java.util.Arrays;
2627
import java.util.Collections;
2728
import java.util.List;
29+
import java.util.Map;
2830
import java.util.concurrent.BrokenBarrierException;
2931
import java.util.concurrent.CountDownLatch;
3032
import java.util.concurrent.CyclicBarrier;
@@ -46,6 +48,7 @@
4648
import static org.hamcrest.Matchers.lessThanOrEqualTo;
4749
import static org.hamcrest.Matchers.not;
4850
import static org.hamcrest.Matchers.nullValue;
51+
import static org.hamcrest.Matchers.oneOf;
4952
import static org.hamcrest.Matchers.sameInstance;
5053

5154
public class HierarchyCircuitBreakerServiceTests extends ESTestCase {
@@ -873,4 +876,77 @@ public void testApplySettingForUpdatingUseRealMemory() {
873876
);
874877
}
875878
}
879+
880+
public void testBuildParentTripMessage() {
881+
class TestChildCircuitBreaker extends NoopCircuitBreaker {
882+
private final long used;
883+
884+
TestChildCircuitBreaker(long used) {
885+
super("child");
886+
this.used = used;
887+
}
888+
889+
@Override
890+
public long getUsed() {
891+
return used;
892+
}
893+
894+
@Override
895+
public double getOverhead() {
896+
return 1.0;
897+
}
898+
}
899+
900+
assertThat(
901+
HierarchyCircuitBreakerService.buildParentTripMessage(
902+
1L,
903+
"test",
904+
new HierarchyCircuitBreakerService.MemoryUsage(2L, 3L, 4L, 5L),
905+
6L,
906+
false,
907+
Map.of("child", new TestChildCircuitBreaker(7L), "otherChild", new TestChildCircuitBreaker(8L))
908+
),
909+
oneOf(
910+
"[parent] Data too large, data for [test] would be [3/3b], which is larger than the limit of [6/6b], "
911+
+ "usages [child=7/7b, otherChild=8/8b]",
912+
"[parent] Data too large, data for [test] would be [3/3b], which is larger than the limit of [6/6b], "
913+
+ "usages [otherChild=8/8b, child=7/7b]"
914+
)
915+
);
916+
917+
assertThat(
918+
HierarchyCircuitBreakerService.buildParentTripMessage(
919+
1L,
920+
"test",
921+
new HierarchyCircuitBreakerService.MemoryUsage(2L, 3L, 4L, 5L),
922+
6L,
923+
true,
924+
Map.of()
925+
),
926+
equalTo(
927+
"[parent] Data too large, data for [test] would be [3/3b], which is larger than the limit of [6/6b], "
928+
+ "real usage: [2/2b], new bytes reserved: [1/1b], usages []"
929+
)
930+
);
931+
932+
try {
933+
HierarchyCircuitBreakerService.permitNegativeValues = true;
934+
assertThat(
935+
HierarchyCircuitBreakerService.buildParentTripMessage(
936+
-1L,
937+
"test",
938+
new HierarchyCircuitBreakerService.MemoryUsage(-2L, -3L, -4L, -5L),
939+
-6L,
940+
true,
941+
Map.of("child1", new TestChildCircuitBreaker(-7L))
942+
),
943+
equalTo(
944+
"[parent] Data too large, data for [test] would be [-3], which is larger than the limit of [-6], "
945+
+ "real usage: [-2], new bytes reserved: [-1], usages [child1=-7]"
946+
)
947+
);
948+
} finally {
949+
HierarchyCircuitBreakerService.permitNegativeValues = false;
950+
}
951+
}
876952
}

0 commit comments

Comments
 (0)