5 changes: 5 additions & 0 deletions docs/changelog/132497.yaml
@@ -0,0 +1,5 @@
pr: 132497
summary: Add cache miss and read metrics
area: Searchable Snapshots
type: enhancement
issues: []
BlobCacheMetrics.java
@@ -12,12 +12,15 @@
import org.elasticsearch.index.store.LuceneFilesExtensions;
import org.elasticsearch.telemetry.TelemetryProvider;
import org.elasticsearch.telemetry.metric.DoubleHistogram;
import org.elasticsearch.telemetry.metric.DoubleWithAttributes;
import org.elasticsearch.telemetry.metric.LongCounter;
import org.elasticsearch.telemetry.metric.LongHistogram;
import org.elasticsearch.telemetry.metric.LongWithAttributes;
import org.elasticsearch.telemetry.metric.MeterRegistry;

import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.LongAdder;

public class BlobCacheMetrics {
private static final Logger logger = LogManager.getLogger(BlobCacheMetrics.class);
@@ -37,6 +40,9 @@ public class BlobCacheMetrics {
private final LongCounter cachePopulationBytes;
private final LongCounter cachePopulationTime;

private final LongAdder missCount = new LongAdder();
private final LongAdder readCount = new LongAdder();

public enum CachePopulationReason {
/**
* When warming the cache
@@ -94,6 +100,31 @@ public BlobCacheMetrics(MeterRegistry meterRegistry) {
"milliseconds"
)
);

meterRegistry.registerLongGauge(
"es.blob_cache.read.total",
"The number of cache reads (warming not included)",
"count",
() -> new LongWithAttributes(readCount.longValue())
);
// Note that this differs from `miss_that_triggered_read`: that metric counts once per gap filled for a single read,
// whereas this one counts once per read that had to populate data from the object store (though once per region for
// multi-region reads). That also makes it usable for reasoning about the hit ratio.
meterRegistry.registerLongGauge(
"es.blob_cache.miss.total",
"The number of cache misses (warming not included)",
"count",
() -> new LongWithAttributes(missCount.longValue())
);
// This gauge makes it easy to search for nodes with a high or low miss ratio, though it covers the whole lifetime of
// the node since boot. More advanced queries can use deltas of the two totals above to compute the miss ratio over a
// time window (a sketch of that follows this constructor).
meterRegistry.registerDoubleGauge(
"es.blob_cache.miss.ratio",
"The fraction of cache reads that missed data (warming not included)",
"fraction",
// read misses before reads on purpose: a read that completes concurrently can then only lower the ratio, and min() clamps it at 1.0 regardless
() -> new DoubleWithAttributes(Math.min((double) missCount.longValue() / Math.max(readCount.longValue(), 1L), 1.0d))
);
}
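
Not part of the diff above: the comment on es.blob_cache.miss.ratio mentions deriving a miss ratio over time from deltas of the two totals. A minimal sketch, assuming a hypothetical metrics consumer that keeps the previous collection of both gauges — for example, 3 new misses over 10 new reads in a window gives 0.3, and the 0/0 case reports 0.0:

    // Hypothetical consumer-side helper (not part of this PR): windowed miss ratio from two
    // consecutive snapshots of es.blob_cache.miss.total and es.blob_cache.read.total.
    static double windowedMissRatio(long previousMisses, long currentMisses, long previousReads, long currentReads) {
        long missDelta = currentMisses - previousMisses;
        long readDelta = currentReads - previousReads;
        // same guard and clamp as the lifetime gauge: never divide by zero, never exceed 1.0
        return Math.min((double) missDelta / Math.max(readDelta, 1L), 1.0d);
    }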

BlobCacheMetrics(
@@ -170,6 +201,22 @@ public void recordCachePopulationMetrics(
}
}

public void recordRead() {
readCount.increment();
}

public void recordMiss() {
missCount.increment();
}

public long readCount() {
return readCount.sum();
}

public long missCount() {
return missCount.sum();
}

/**
* Calculate throughput as MiB/second
*
SharedBlobCacheService.java
@@ -328,7 +328,6 @@ private CacheEntry(T chunk) {
private final LongAdder writeCount = new LongAdder();
private final LongAdder writeBytes = new LongAdder();

private final LongAdder readCount = new LongAdder();
private final LongAdder readBytes = new LongAdder();

private final LongAdder evictCount = new LongAdder();
@@ -741,8 +740,9 @@ public Stats getStats() {
evictCount.sum(),
writeCount.sum(),
writeBytes.sum(),
readCount.sum(),
readBytes.sum()
blobCacheMetrics.readCount(),
readBytes.sum(),
blobCacheMetrics.missCount()
);
}

@@ -1113,7 +1113,7 @@ void populateAndRead(
+ '-'
+ rangeToRead.start()
+ ']';
blobCacheService.readCount.increment();
blobCacheService.blobCacheMetrics.recordRead();
l.onResponse(read);
})
);
@@ -1228,19 +1228,25 @@ public boolean tryRead(ByteBuffer buf, long offset) throws IOException {
return false;
}
var fileRegion = lastAccessedRegion;
boolean incrementReads = false;
if (fileRegion != null && fileRegion.chunk.regionKey.region == startRegion) {
// existing item, check if we need to promote item
fileRegion.touch();

} else {
fileRegion = cache.get(cacheKey, length, startRegion);
incrementReads = true;
}
final var region = fileRegion.chunk;
if (region.tracker.checkAvailable(end - getRegionStart(startRegion)) == false) {
return false;
}
boolean res = region.tryRead(buf, offset);
lastAccessedRegion = res ? fileRegion : null;
if (res && incrementReads) {
blobCacheMetrics.recordRead();
// todo: should we add to readBytes? readBytes.add(end - offset);
}
return res;
}

@@ -1309,7 +1315,7 @@ private int readSingleRegion(
mapSubRangeToRegion(rangeToWrite, region),
mapSubRangeToRegion(rangeToRead, region),
readerWithOffset(reader, fileRegion, Math.toIntExact(rangeToRead.start() - regionStart)),
writerWithOffset(writer, fileRegion, Math.toIntExact(rangeToWrite.start() - regionStart)),
metricRecordingWriter(writerWithOffset(writer, fileRegion, Math.toIntExact(rangeToWrite.start() - regionStart))),
ioExecutor,
readFuture
);
@@ -1341,7 +1347,9 @@ private int readMultiRegions(
mapSubRangeToRegion(rangeToWrite, region),
subRangeToRead,
readerWithOffset(reader, fileRegion, Math.toIntExact(rangeToRead.start() - regionStart)),
writerWithOffset(writer, fileRegion, Math.toIntExact(rangeToWrite.start() - regionStart)),
metricRecordingWriter(
writerWithOffset(writer, fileRegion, Math.toIntExact(rangeToWrite.start() - regionStart))
),
ioExecutor,
listener
);
@@ -1416,6 +1424,16 @@ public void fillCacheRange(
return adjustedWriter;
}

private RangeMissingHandler metricRecordingWriter(RangeMissingHandler writer) {
return new DelegatingRangeMissingHandler(writer) {
@Override
public SourceInputStreamFactory sharedInputStreamFactory(List<SparseFileTracker.Gap> gaps) {
blobCacheMetrics.recordMiss();
return super.sharedInputStreamFactory(gaps);
}
};
}

private RangeAvailableHandler readerWithOffset(RangeAvailableHandler reader, CacheFileRegion<KeyType> fileRegion, int readOffset) {
final RangeAvailableHandler adjustedReader = (channel, channelPos, relativePos, len) -> reader.onRangeAvailable(
channel,
@@ -1558,9 +1576,11 @@ public record Stats(
long writeCount,
long writeBytes,
long readCount,
long readBytes
long readBytes,
// miss-count not exposed in REST API for now
long missCount
) {
public static final Stats EMPTY = new Stats(0, 0L, 0L, 0L, 0L, 0L, 0L, 0L);
public static final Stats EMPTY = new Stats(0, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L);
}

private class LFUCache implements Cache<KeyType, CacheFileRegion<KeyType>> {
BlobCacheMetricsTests.java
@@ -17,6 +17,7 @@

import java.util.Arrays;
import java.util.concurrent.TimeUnit;
import java.util.stream.IntStream;

import static org.hamcrest.Matchers.is;

Expand Down Expand Up @@ -66,6 +67,40 @@ public void testRecordCachePopulationMetricsRecordsThroughput() {
.get(0);
assertEquals(totalTimeMeasurement.getLong(), TimeUnit.SECONDS.toMillis(secondsTaken));
assertExpectedAttributesPresent(totalTimeMeasurement, cachePopulationReason, cachePopulationSource, fileExtension);

// check the all-zero case first: the ratio gauge divides by max(reads, 1), so it must report 0 rather than divide by zero
checkReadsAndMisses(0, 0, 1);
int reads = between(1, 100);
int misses = between(1, reads);
recordMisses(metrics, misses);
checkReadsAndMisses(0, misses, misses);
IntStream.range(0, reads).forEach(i -> metrics.recordRead());
checkReadsAndMisses(reads, misses, reads);
recordMisses(metrics, reads);
checkReadsAndMisses(reads, misses + reads, misses + reads);
}

private void recordMisses(BlobCacheMetrics metrics, int misses) {
IntStream.range(0, misses).forEach(i -> metrics.recordMiss());
}

private void checkReadsAndMisses(int reads, int misses, int readsForRatio) {
recordingMeterRegistry.getRecorder().collect();

Measurement totalReadsMeasurement = recordingMeterRegistry.getRecorder()
.getMeasurements(InstrumentType.LONG_GAUGE, "es.blob_cache.read.total")
.getLast();
assertEquals(reads, totalReadsMeasurement.getLong());

Measurement totalMissesMeasurement = recordingMeterRegistry.getRecorder()
.getMeasurements(InstrumentType.LONG_GAUGE, "es.blob_cache.miss.total")
.getLast();
assertEquals(misses, totalMissesMeasurement.getLong());

Measurement missRatio = recordingMeterRegistry.getRecorder()
.getMeasurements(InstrumentType.DOUBLE_GAUGE, "es.blob_cache.miss.ratio")
.getLast();
assertEquals((double) misses / readsForRatio, missRatio.getDouble(), 0.00000001d);
}

private static void assertExpectedAttributesPresent(