Skip to content

Commit 444c5f2

Browse files
Add cache miss and read metrics (#132497)
Fix up the reads metric to also consider tryRead accessing a new region as a read/hit. Add cache miss metric that only increments once per region per read. Add a miss ratio as well for easy querying.
1 parent 9a1f688 commit 444c5f2

File tree

4 files changed

+115
-8
lines changed

4 files changed

+115
-8
lines changed

docs/changelog/132497.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 132497
2+
summary: Add cache miss and read metrics
3+
area: Searchable Snapshots
4+
type: enhancement
5+
issues: []

x-pack/plugin/blob-cache/src/main/java/org/elasticsearch/blobcache/BlobCacheMetrics.java

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,15 @@
1212
import org.elasticsearch.index.store.LuceneFilesExtensions;
1313
import org.elasticsearch.telemetry.TelemetryProvider;
1414
import org.elasticsearch.telemetry.metric.DoubleHistogram;
15+
import org.elasticsearch.telemetry.metric.DoubleWithAttributes;
1516
import org.elasticsearch.telemetry.metric.LongCounter;
1617
import org.elasticsearch.telemetry.metric.LongHistogram;
18+
import org.elasticsearch.telemetry.metric.LongWithAttributes;
1719
import org.elasticsearch.telemetry.metric.MeterRegistry;
1820

1921
import java.util.Map;
2022
import java.util.concurrent.TimeUnit;
23+
import java.util.concurrent.atomic.LongAdder;
2124

2225
public class BlobCacheMetrics {
2326
private static final Logger logger = LogManager.getLogger(BlobCacheMetrics.class);
@@ -37,6 +40,9 @@ public class BlobCacheMetrics {
3740
private final LongCounter cachePopulationBytes;
3841
private final LongCounter cachePopulationTime;
3942

43+
private final LongAdder missCount = new LongAdder();
44+
private final LongAdder readCount = new LongAdder();
45+
4046
public enum CachePopulationReason {
4147
/**
4248
* When warming the cache
@@ -94,6 +100,31 @@ public BlobCacheMetrics(MeterRegistry meterRegistry) {
94100
"milliseconds"
95101
)
96102
);
103+
104+
meterRegistry.registerLongGauge(
105+
"es.blob_cache.read.total",
106+
"The number of cache reads (warming not included)",
107+
"count",
108+
() -> new LongWithAttributes(readCount.longValue())
109+
);
110+
// Note that this differs from `miss_that_triggered_read`, which counts once per gap filled for a single read, whereas this
111+
// metric counts once whenever a read provoked populating data from the object store (once per region for multi-region
112+
// reads). This allows reasoning about the hit ratio too.
113+
meterRegistry.registerLongGauge(
114+
"es.blob_cache.miss.total",
115+
"The number of cache misses (warming not included)",
116+
"count",
117+
() -> new LongWithAttributes(missCount.longValue())
118+
);
119+
// Exposing the ratio directly makes it easy to search for a high or low miss ratio. Note that it is cumulative since the node
120+
// booted; more advanced queries can use deltas of the totals to see the miss ratio over time.
121+
meterRegistry.registerDoubleGauge(
122+
"es.blob_cache.miss.ratio",
123+
"The fraction of cache reads that missed data (warming not included)",
124+
"fraction",
125+
// the miss count is deliberately sampled before the read count
126+
() -> new DoubleWithAttributes(Math.min((double) missCount.longValue() / Math.max(readCount.longValue(), 1L), 1.0d))
127+
);
97128
}
98129

99130
BlobCacheMetrics(
@@ -170,6 +201,22 @@ public void recordCachePopulationMetrics(
170201
}
171202
}
172203

204+
/**
 * Records a single cache read by incrementing the read counter that backs the
 * {@code es.blob_cache.read.total} gauge and the denominator of {@code es.blob_cache.miss.ratio}.
 */
public void recordRead() {
205+
readCount.increment();
206+
}
207+
208+
/**
 * Records a single cache miss by incrementing the miss counter that backs the
 * {@code es.blob_cache.miss.total} gauge and the numerator of {@code es.blob_cache.miss.ratio}.
 */
public void recordMiss() {
209+
missCount.increment();
210+
}
211+
212+
/**
 * Returns the current sum of the read counter, i.e. how many times {@link #recordRead()} has been called.
 *
 * @return the number of reads recorded so far
 */
public long readCount() {
213+
return readCount.sum();
214+
}
215+
216+
/**
 * Returns the current sum of the miss counter, i.e. how many times {@link #recordMiss()} has been called.
 *
 * @return the number of misses recorded so far
 */
public long missCount() {
217+
return missCount.sum();
218+
}
219+
173220
/**
174221
* Calculate throughput as MiB/second
175222
*

x-pack/plugin/blob-cache/src/main/java/org/elasticsearch/blobcache/shared/SharedBlobCacheService.java

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,6 @@ private CacheEntry(T chunk) {
328328
private final LongAdder writeCount = new LongAdder();
329329
private final LongAdder writeBytes = new LongAdder();
330330

331-
private final LongAdder readCount = new LongAdder();
332331
private final LongAdder readBytes = new LongAdder();
333332

334333
private final LongAdder evictCount = new LongAdder();
@@ -741,8 +740,9 @@ public Stats getStats() {
741740
evictCount.sum(),
742741
writeCount.sum(),
743742
writeBytes.sum(),
744-
readCount.sum(),
745-
readBytes.sum()
743+
blobCacheMetrics.readCount(),
744+
readBytes.sum(),
745+
blobCacheMetrics.missCount()
746746
);
747747
}
748748

@@ -1113,7 +1113,7 @@ void populateAndRead(
11131113
+ '-'
11141114
+ rangeToRead.start()
11151115
+ ']';
1116-
blobCacheService.readCount.increment();
1116+
blobCacheService.blobCacheMetrics.recordRead();
11171117
l.onResponse(read);
11181118
})
11191119
);
@@ -1228,19 +1228,25 @@ public boolean tryRead(ByteBuffer buf, long offset) throws IOException {
12281228
return false;
12291229
}
12301230
var fileRegion = lastAccessedRegion;
1231+
boolean incrementReads = false;
12311232
if (fileRegion != null && fileRegion.chunk.regionKey.region == startRegion) {
12321233
// existing item, check if we need to promote item
12331234
fileRegion.touch();
12341235

12351236
} else {
12361237
fileRegion = cache.get(cacheKey, length, startRegion);
1238+
incrementReads = true;
12371239
}
12381240
final var region = fileRegion.chunk;
12391241
if (region.tracker.checkAvailable(end - getRegionStart(startRegion)) == false) {
12401242
return false;
12411243
}
12421244
boolean res = region.tryRead(buf, offset);
12431245
lastAccessedRegion = res ? fileRegion : null;
1246+
if (res && incrementReads) {
1247+
blobCacheMetrics.recordRead();
1248+
// todo: should we add to readBytes? readBytes.add(end - offset);
1249+
}
12441250
return res;
12451251
}
12461252

@@ -1309,7 +1315,7 @@ private int readSingleRegion(
13091315
mapSubRangeToRegion(rangeToWrite, region),
13101316
mapSubRangeToRegion(rangeToRead, region),
13111317
readerWithOffset(reader, fileRegion, Math.toIntExact(rangeToRead.start() - regionStart)),
1312-
writerWithOffset(writer, fileRegion, Math.toIntExact(rangeToWrite.start() - regionStart)),
1318+
metricRecordingWriter(writerWithOffset(writer, fileRegion, Math.toIntExact(rangeToWrite.start() - regionStart))),
13131319
ioExecutor,
13141320
readFuture
13151321
);
@@ -1341,7 +1347,9 @@ private int readMultiRegions(
13411347
mapSubRangeToRegion(rangeToWrite, region),
13421348
subRangeToRead,
13431349
readerWithOffset(reader, fileRegion, Math.toIntExact(rangeToRead.start() - regionStart)),
1344-
writerWithOffset(writer, fileRegion, Math.toIntExact(rangeToWrite.start() - regionStart)),
1350+
metricRecordingWriter(
1351+
writerWithOffset(writer, fileRegion, Math.toIntExact(rangeToWrite.start() - regionStart))
1352+
),
13451353
ioExecutor,
13461354
listener
13471355
);
@@ -1416,6 +1424,16 @@ public void fillCacheRange(
14161424
return adjustedWriter;
14171425
}
14181426

1427+
/**
 * Wraps {@code writer} in a delegating handler that records one cache miss each time
 * {@code sharedInputStreamFactory} is invoked, and then forwards the call to the wrapped handler.
 *
 * @param writer the handler to delegate to
 * @return a handler that calls {@code blobCacheMetrics.recordMiss()} before delegating
 */
private RangeMissingHandler metricRecordingWriter(RangeMissingHandler writer) {
1428+
return new DelegatingRangeMissingHandler(writer) {
1429+
@Override
1430+
public SourceInputStreamFactory sharedInputStreamFactory(List<SparseFileTracker.Gap> gaps) {
1431+
// count the miss first, then let the wrapped handler build the factory for the gaps
blobCacheMetrics.recordMiss();
1432+
return super.sharedInputStreamFactory(gaps);
1433+
}
1434+
};
1435+
}
1436+
14191437
private RangeAvailableHandler readerWithOffset(RangeAvailableHandler reader, CacheFileRegion<KeyType> fileRegion, int readOffset) {
14201438
final RangeAvailableHandler adjustedReader = (channel, channelPos, relativePos, len) -> reader.onRangeAvailable(
14211439
channel,
@@ -1558,9 +1576,11 @@ public record Stats(
15581576
long writeCount,
15591577
long writeBytes,
15601578
long readCount,
1561-
long readBytes
1579+
long readBytes,
1580+
// miss-count not exposed in REST API for now
1581+
long missCount
15621582
) {
1563-
public static final Stats EMPTY = new Stats(0, 0L, 0L, 0L, 0L, 0L, 0L, 0L);
1583+
public static final Stats EMPTY = new Stats(0, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L);
15641584
}
15651585

15661586
private class LFUCache implements Cache<KeyType, CacheFileRegion<KeyType>> {

x-pack/plugin/blob-cache/src/test/java/org/elasticsearch/blobcache/BlobCacheMetricsTests.java

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import java.util.Arrays;
1919
import java.util.concurrent.TimeUnit;
20+
import java.util.stream.IntStream;
2021

2122
import static org.hamcrest.Matchers.is;
2223

@@ -66,6 +67,40 @@ public void testRecordCachePopulationMetricsRecordsThroughput() {
6667
.get(0);
6768
assertEquals(totalTimeMeasurement.getLong(), TimeUnit.SECONDS.toMillis(secondsTaken));
6869
assertExpectedAttributesPresent(totalTimeMeasurement, cachePopulationReason, cachePopulationSource, fileExtension);
70+
71+
// check the initial state: with zero reads recorded, the ratio must be reported as 0 rather than dividing by zero.
72+
checkReadsAndMisses(0, 0, 1);
73+
int reads = between(1, 100);
74+
int misses = between(1, reads);
75+
recordMisses(metrics, misses);
76+
checkReadsAndMisses(0, misses, misses);
77+
IntStream.range(0, reads).forEach(i -> metrics.recordRead());
78+
checkReadsAndMisses(reads, misses, reads);
79+
recordMisses(metrics, reads);
80+
checkReadsAndMisses(reads, misses + reads, misses + reads);
81+
}
82+
83+
/**
 * Calls {@code metrics.recordMiss()} exactly {@code misses} times.
 *
 * @param metrics the metrics instance to record against
 * @param misses  how many misses to record
 */
private void recordMisses(BlobCacheMetrics metrics, int misses) {
84+
IntStream.range(0, misses).forEach(i -> metrics.recordMiss());
85+
}
86+
87+
/**
 * Runs a collection on the recording meter registry and asserts the most recent values of the read total,
 * miss total and miss ratio gauges.
 *
 * @param reads         expected value of {@code es.blob_cache.read.total}
 * @param writes        expected value of {@code es.blob_cache.miss.total}; despite its name this is the expected
 *                      miss count, not a write count
 * @param readsForRatio denominator used to compute the expected {@code es.blob_cache.miss.ratio}
 *                      (callers pass 1 when no reads were recorded, and the miss count when misses exceed reads)
 */
private void checkReadsAndMisses(int reads, int writes, int readsForRatio) {
88+
// collect first so that the assertions below look at freshly recorded gauge measurements
recordingMeterRegistry.getRecorder().collect();
89+
90+
Measurement totalReadsMeasurement = recordingMeterRegistry.getRecorder()
91+
.getMeasurements(InstrumentType.LONG_GAUGE, "es.blob_cache.read.total")
92+
.getLast();
93+
assertEquals(reads, totalReadsMeasurement.getLong());
94+
95+
Measurement totalMissesMeasurement = recordingMeterRegistry.getRecorder()
96+
.getMeasurements(InstrumentType.LONG_GAUGE, "es.blob_cache.miss.total")
97+
.getLast();
98+
// `writes` carries the expected number of misses
assertEquals(writes, totalMissesMeasurement.getLong());
99+
100+
Measurement missRatio = recordingMeterRegistry.getRecorder()
101+
.getMeasurements(InstrumentType.DOUBLE_GAUGE, "es.blob_cache.miss.ratio")
102+
.getLast();
103+
// expected ratio is misses / readsForRatio, compared with a small tolerance for floating point error
assertEquals((double) writes / readsForRatio, missRatio.getDouble(), 0.00000001d);
69104
}
70105

71106
private static void assertExpectedAttributesPresent(

0 commit comments

Comments
 (0)