Skip to content

Commit 56d8957

Browse files
authored
Make CBE message creation more robust (#87886)
Child circuit breakers rely on proper matching of acquire/release pairs. This can be tricky to get right. If we get it wrong and accidentally double-release a CB then it may end up with a negative `used` value. This is definitely a bad situation in which to find ourselves, but today in production it's made a whole lot worse because it causes exceptions on every attempt to report a `CircuitBreakerStats` or to construct a parent `CircuitBreakingException`. This commit makes the message construction and stats serialization a little more robust so that it's clearer what is going on in production. Relates #86059 Backport of #87881
1 parent 76e7bd0 commit 56d8957

File tree

5 files changed

+250
-54
lines changed

5 files changed

+250
-54
lines changed

docs/changelog/87881.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 87881
2+
summary: Make CBE message creation more robust
3+
area: Infra/Circuit Breakers
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/indices/breaker/CircuitBreakerStats.java

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -77,33 +77,43 @@ public double getOverhead() {
7777
@Override
7878
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
7979
builder.startObject(name.toLowerCase(Locale.ROOT));
80-
builder.field(Fields.LIMIT, limit);
81-
builder.field(Fields.LIMIT_HUMAN, new ByteSizeValue(limit));
82-
builder.field(Fields.ESTIMATED, estimated);
83-
builder.field(Fields.ESTIMATED_HUMAN, new ByteSizeValue(estimated));
80+
addBytesFieldsSafe(builder, limit, Fields.LIMIT, Fields.LIMIT_HUMAN);
81+
addBytesFieldsSafe(builder, estimated, Fields.ESTIMATED, Fields.ESTIMATED_HUMAN);
8482
builder.field(Fields.OVERHEAD, overhead);
8583
builder.field(Fields.TRIPPED_COUNT, trippedCount);
8684
builder.endObject();
8785
return builder;
8886
}
8987

88+
private void addBytesFieldsSafe(XContentBuilder builder, long bytes, String rawFieldName, String humanFieldName) throws IOException {
89+
builder.field(rawFieldName, bytes);
90+
if (-1L <= bytes) {
91+
builder.field(humanFieldName, new ByteSizeValue(bytes));
92+
} else {
93+
// Something's definitely wrong, maybe a breaker was freed twice? Still, we're just writing out stats here, so we should keep
94+
// going if we're running in production.
95+
assert HierarchyCircuitBreakerService.permitNegativeValues : this;
96+
// noinspection ResultOfMethodCallIgnored - we call toString() to log a warning
97+
toString();
98+
builder.field(humanFieldName, "");
99+
}
100+
}
101+
90102
@Override
91103
public String toString() {
92-
return "["
93-
+ this.name
94-
+ ",limit="
95-
+ this.limit
96-
+ "/"
97-
+ new ByteSizeValue(this.limit)
98-
+ ",estimated="
99-
+ this.estimated
100-
+ "/"
101-
+ new ByteSizeValue(this.estimated)
102-
+ ",overhead="
103-
+ this.overhead
104-
+ ",tripped="
105-
+ this.trippedCount
106-
+ "]";
104+
final StringBuilder stringBuilder = new StringBuilder();
105+
stringBuilder.append("[");
106+
stringBuilder.append(this.name);
107+
stringBuilder.append(",limit=");
108+
HierarchyCircuitBreakerService.appendBytesSafe(stringBuilder, this.limit);
109+
stringBuilder.append(",estimated=");
110+
HierarchyCircuitBreakerService.appendBytesSafe(stringBuilder, this.estimated);
111+
stringBuilder.append(",overhead=");
112+
stringBuilder.append(this.overhead);
113+
stringBuilder.append(",tripped=");
114+
stringBuilder.append(this.trippedCount);
115+
stringBuilder.append("]");
116+
return stringBuilder.toString();
107117
}
108118

109119
static final class Fields {

server/src/main/java/org/elasticsearch/indices/breaker/HierarchyCircuitBreakerService.java

Lines changed: 70 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import java.util.Map;
3838
import java.util.concurrent.atomic.AtomicLong;
3939
import java.util.concurrent.locks.ReentrantLock;
40+
import java.util.function.BiConsumer;
4041
import java.util.function.Function;
4142
import java.util.function.LongSupplier;
4243
import java.util.stream.Collectors;
@@ -417,47 +418,78 @@ public void checkParentLimit(long newBytesReserved, String label) throws Circuit
417418
long parentLimit = this.parentSettings.getLimit();
418419
if (memoryUsed.totalUsage > parentLimit && overLimitStrategy.overLimit(memoryUsed).totalUsage > parentLimit) {
419420
this.parentTripCount.incrementAndGet();
420-
final StringBuilder message = new StringBuilder(
421-
"[parent] Data too large, data for ["
422-
+ label
423-
+ "]"
424-
+ " would be ["
425-
+ memoryUsed.totalUsage
426-
+ "/"
427-
+ new ByteSizeValue(memoryUsed.totalUsage)
428-
+ "]"
429-
+ ", which is larger than the limit of ["
430-
+ parentLimit
431-
+ "/"
432-
+ new ByteSizeValue(parentLimit)
433-
+ "]"
421+
final String messageString = buildParentTripMessage(
422+
newBytesReserved,
423+
label,
424+
memoryUsed,
425+
parentLimit,
426+
this.trackRealMemoryUsage,
427+
this.breakers
434428
);
435-
if (this.trackRealMemoryUsage) {
436-
final long realUsage = memoryUsed.baseUsage;
437-
message.append(", real usage: [");
438-
message.append(realUsage);
439-
message.append("/");
440-
message.append(new ByteSizeValue(realUsage));
441-
message.append("], new bytes reserved: [");
442-
message.append(newBytesReserved);
443-
message.append("/");
444-
message.append(new ByteSizeValue(newBytesReserved));
445-
message.append("]");
446-
}
447-
message.append(", usages [");
448-
message.append(this.breakers.entrySet().stream().map(e -> {
449-
final CircuitBreaker breaker = e.getValue();
450-
final long breakerUsed = (long) (breaker.getUsed() * breaker.getOverhead());
451-
return e.getKey() + "=" + breakerUsed + "/" + new ByteSizeValue(breakerUsed);
452-
}).collect(Collectors.joining(", ")));
453-
message.append("]");
454429
// derive durability of a tripped parent breaker depending on whether the majority of memory tracked by
455430
// child circuit breakers is categorized as transient or permanent.
456431
CircuitBreaker.Durability durability = memoryUsed.transientChildUsage >= memoryUsed.permanentChildUsage
457432
? CircuitBreaker.Durability.TRANSIENT
458433
: CircuitBreaker.Durability.PERMANENT;
459-
logger.debug(() -> new ParameterizedMessage("{}", message.toString()));
460-
throw new CircuitBreakingException(message.toString(), memoryUsed.totalUsage, parentLimit, durability);
434+
logger.debug(() -> new ParameterizedMessage("{}", messageString));
435+
throw new CircuitBreakingException(messageString, memoryUsed.totalUsage, parentLimit, durability);
436+
}
437+
}
438+
439+
// exposed for tests
440+
static String buildParentTripMessage(
441+
long newBytesReserved,
442+
String label,
443+
MemoryUsage memoryUsed,
444+
long parentLimit,
445+
boolean trackRealMemoryUsage,
446+
Map<String, CircuitBreaker> breakers
447+
) {
448+
final StringBuilder message = new StringBuilder();
449+
message.append("[parent] Data too large, data for [");
450+
message.append(label);
451+
message.append("] would be [");
452+
appendBytesSafe(message, memoryUsed.totalUsage);
453+
message.append("], which is larger than the limit of [");
454+
appendBytesSafe(message, parentLimit);
455+
message.append("]");
456+
if (trackRealMemoryUsage) {
457+
final long realUsage = memoryUsed.baseUsage;
458+
message.append(", real usage: [");
459+
appendBytesSafe(message, realUsage);
460+
message.append("], new bytes reserved: [");
461+
appendBytesSafe(message, newBytesReserved);
462+
message.append("]");
463+
}
464+
message.append(", usages [");
465+
breakers.forEach(new BiConsumer<String, CircuitBreaker>() {
466+
private boolean first = true;
467+
468+
@Override
469+
public void accept(String key, CircuitBreaker breaker) {
470+
if (first) {
471+
first = false;
472+
} else {
473+
message.append(", ");
474+
}
475+
message.append(key).append("=");
476+
appendBytesSafe(message, (long) (breaker.getUsed() * breaker.getOverhead()));
477+
}
478+
});
479+
message.append("]");
480+
return message.toString();
481+
}
482+
483+
static void appendBytesSafe(StringBuilder stringBuilder, long bytes) {
484+
stringBuilder.append(bytes);
485+
if (-1L <= bytes) {
486+
stringBuilder.append("/");
487+
stringBuilder.append(new ByteSizeValue(bytes));
488+
} else {
489+
// Something's definitely wrong, maybe a breaker was freed twice? Still, we're just creating an exception message here, so we
490+
// should keep going if we're running in production.
491+
logger.error("negative value in circuit breaker: {}", stringBuilder);
492+
assert permitNegativeValues : stringBuilder.toString();
461493
}
462494
}
463495

@@ -656,4 +688,7 @@ TimeValue getLockTimeout() {
656688
return lockTimeout;
657689
}
658690
}
691+
692+
// exposed for testing
693+
static boolean permitNegativeValues = false;
659694
}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0 and the Server Side Public License, v 1; you may not use this file except
5+
* in compliance with, at your election, the Elastic License 2.0 or the Server
6+
* Side Public License, v 1.
7+
*/
8+
9+
package org.elasticsearch.indices.breaker;
10+
11+
import org.elasticsearch.common.Strings;
12+
import org.elasticsearch.test.ESTestCase;
13+
import org.elasticsearch.xcontent.ToXContentObject;
14+
15+
import static org.hamcrest.Matchers.equalTo;
16+
17+
public class CircuitBreakerStatsTests extends ESTestCase {
18+
19+
public void testStringRepresentations() {
20+
final CircuitBreakerStats circuitBreakerStats = new CircuitBreakerStats("t", 1L, 2L, 1.0, 3L);
21+
assertThat(circuitBreakerStats.toString(), equalTo("[t,limit=1/1b,estimated=2/2b,overhead=1.0,tripped=3]"));
22+
assertThat(
23+
toJson(circuitBreakerStats),
24+
equalTo(
25+
"{\"t\":{\"limit_size_in_bytes\":1,\"limit_size\":\"1b\","
26+
+ "\"estimated_size_in_bytes\":2,\"estimated_size\":\"2b\","
27+
+ "\"overhead\":1.0,\"tripped\":3}}"
28+
)
29+
);
30+
}
31+
32+
public void testStringRepresentationPermitsNegativeOne() {
33+
final CircuitBreakerStats circuitBreakerStats = new CircuitBreakerStats("t", -1L, -1L, 1.0, 3L);
34+
assertThat(circuitBreakerStats.toString(), equalTo("[t,limit=-1/-1b,estimated=-1/-1b,overhead=1.0,tripped=3]"));
35+
assertThat(
36+
toJson(circuitBreakerStats),
37+
equalTo(
38+
"{\"t\":{\"limit_size_in_bytes\":-1,\"limit_size\":\"-1b\","
39+
+ "\"estimated_size_in_bytes\":-1,\"estimated_size\":\"-1b\","
40+
+ "\"overhead\":1.0,\"tripped\":3}}"
41+
)
42+
);
43+
}
44+
45+
public void testStringRepresentationsWithNegativeStats() {
46+
try {
47+
HierarchyCircuitBreakerService.permitNegativeValues = true;
48+
final CircuitBreakerStats circuitBreakerStats = new CircuitBreakerStats("t", -2L, -3L, 1.0, 3L);
49+
assertThat(circuitBreakerStats.toString(), equalTo("[t,limit=-2,estimated=-3,overhead=1.0,tripped=3]"));
50+
assertThat(
51+
toJson(circuitBreakerStats),
52+
equalTo(
53+
"{\"t\":{\"limit_size_in_bytes\":-2,\"limit_size\":\"\","
54+
+ "\"estimated_size_in_bytes\":-3,\"estimated_size\":\"\","
55+
+ "\"overhead\":1.0,\"tripped\":3}}"
56+
)
57+
);
58+
} finally {
59+
HierarchyCircuitBreakerService.permitNegativeValues = false;
60+
}
61+
}
62+
63+
private static String toJson(CircuitBreakerStats circuitBreakerStats) {
64+
return Strings.toString((ToXContentObject) (builder, params) -> {
65+
builder.startObject();
66+
circuitBreakerStats.toXContent(builder, params);
67+
builder.endObject();
68+
return builder;
69+
}, false, true);
70+
}
71+
}

server/src/test/java/org/elasticsearch/indices/breaker/HierarchyCircuitBreakerServiceTests.java

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import org.elasticsearch.common.breaker.ChildMemoryCircuitBreaker;
1212
import org.elasticsearch.common.breaker.CircuitBreaker;
1313
import org.elasticsearch.common.breaker.CircuitBreakingException;
14+
import org.elasticsearch.common.breaker.NoopCircuitBreaker;
1415
import org.elasticsearch.common.settings.ClusterSettings;
1516
import org.elasticsearch.common.settings.Settings;
1617
import org.elasticsearch.common.unit.ByteSizeUnit;
@@ -46,6 +47,7 @@
4647
import static org.hamcrest.Matchers.lessThanOrEqualTo;
4748
import static org.hamcrest.Matchers.not;
4849
import static org.hamcrest.Matchers.nullValue;
50+
import static org.hamcrest.Matchers.oneOf;
4951
import static org.hamcrest.Matchers.sameInstance;
5052

5153
public class HierarchyCircuitBreakerServiceTests extends ESTestCase {
@@ -817,4 +819,77 @@ public void testCustomCircuitBreakers() {
817819
private static long mb(long size) {
818820
return new ByteSizeValue(size, ByteSizeUnit.MB).getBytes();
819821
}
822+
823+
public void testBuildParentTripMessage() {
824+
class TestChildCircuitBreaker extends NoopCircuitBreaker {
825+
private final long used;
826+
827+
TestChildCircuitBreaker(long used) {
828+
super("child");
829+
this.used = used;
830+
}
831+
832+
@Override
833+
public long getUsed() {
834+
return used;
835+
}
836+
837+
@Override
838+
public double getOverhead() {
839+
return 1.0;
840+
}
841+
}
842+
843+
assertThat(
844+
HierarchyCircuitBreakerService.buildParentTripMessage(
845+
1L,
846+
"test",
847+
new HierarchyCircuitBreakerService.MemoryUsage(2L, 3L, 4L, 5L),
848+
6L,
849+
false,
850+
org.elasticsearch.core.Map.of("child", new TestChildCircuitBreaker(7L), "otherChild", new TestChildCircuitBreaker(8L))
851+
),
852+
oneOf(
853+
"[parent] Data too large, data for [test] would be [3/3b], which is larger than the limit of [6/6b], "
854+
+ "usages [child=7/7b, otherChild=8/8b]",
855+
"[parent] Data too large, data for [test] would be [3/3b], which is larger than the limit of [6/6b], "
856+
+ "usages [otherChild=8/8b, child=7/7b]"
857+
)
858+
);
859+
860+
assertThat(
861+
HierarchyCircuitBreakerService.buildParentTripMessage(
862+
1L,
863+
"test",
864+
new HierarchyCircuitBreakerService.MemoryUsage(2L, 3L, 4L, 5L),
865+
6L,
866+
true,
867+
org.elasticsearch.core.Map.of()
868+
),
869+
equalTo(
870+
"[parent] Data too large, data for [test] would be [3/3b], which is larger than the limit of [6/6b], "
871+
+ "real usage: [2/2b], new bytes reserved: [1/1b], usages []"
872+
)
873+
);
874+
875+
try {
876+
HierarchyCircuitBreakerService.permitNegativeValues = true;
877+
assertThat(
878+
HierarchyCircuitBreakerService.buildParentTripMessage(
879+
-1L,
880+
"test",
881+
new HierarchyCircuitBreakerService.MemoryUsage(-2L, -3L, -4L, -5L),
882+
-6L,
883+
true,
884+
org.elasticsearch.core.Map.of("child1", new TestChildCircuitBreaker(-7L))
885+
),
886+
equalTo(
887+
"[parent] Data too large, data for [test] would be [-3], which is larger than the limit of [-6], "
888+
+ "real usage: [-2], new bytes reserved: [-1/-1b], usages [child1=-7]"
889+
)
890+
);
891+
} finally {
892+
HierarchyCircuitBreakerService.permitNegativeValues = false;
893+
}
894+
}
820895
}

0 commit comments

Comments
 (0)