Skip to content

Commit 5934190

Browse files
authored
Add additional BlobCacheMetrics, expose BlobCacheMetrics via SharedBlobCacheService (elastic#111730)
Relates: ES-9067
1 parent 15890e1 commit 5934190

File tree

4 files changed

+215
-1
lines changed

4 files changed

+215
-1
lines changed

x-pack/plugin/blob-cache/src/main/java/org/elasticsearch/blobcache/BlobCacheMetrics.java

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,43 @@
77

88
package org.elasticsearch.blobcache;
99

10+
import org.apache.logging.log4j.LogManager;
11+
import org.apache.logging.log4j.Logger;
1012
import org.elasticsearch.telemetry.TelemetryProvider;
13+
import org.elasticsearch.telemetry.metric.DoubleHistogram;
1114
import org.elasticsearch.telemetry.metric.LongCounter;
1215
import org.elasticsearch.telemetry.metric.LongHistogram;
1316
import org.elasticsearch.telemetry.metric.MeterRegistry;
1417

18+
import java.util.Map;
19+
import java.util.concurrent.TimeUnit;
20+
1521
public class BlobCacheMetrics {
22+
private static final Logger logger = LogManager.getLogger(BlobCacheMetrics.class);
23+
24+
private static final double BYTES_PER_NANOSECONDS_TO_MEBIBYTES_PER_SECOND = 1e9D / (1 << 20);
25+
public static final String CACHE_POPULATION_REASON_ATTRIBUTE_KEY = "reason";
26+
public static final String CACHE_POPULATION_SOURCE_ATTRIBUTE_KEY = "source";
27+
public static final String SHARD_ID_ATTRIBUTE_KEY = "shard_id";
28+
public static final String INDEX_ATTRIBUTE_KEY = "index_name";
29+
1630
private final LongCounter cacheMissCounter;
1731
private final LongCounter evictedCountNonZeroFrequency;
1832
private final LongHistogram cacheMissLoadTimes;
33+
private final DoubleHistogram cachePopulationThroughput;
34+
private final LongCounter cachePopulationBytes;
35+
private final LongCounter cachePopulationTime;
36+
37+
public enum CachePopulationReason {
38+
/**
39+
* When warming the cache
40+
*/
41+
Warming,
42+
/**
43+
* When the data we need is not in the cache
44+
*/
45+
CacheMiss
46+
}
1947

2048
public BlobCacheMetrics(MeterRegistry meterRegistry) {
2149
this(
@@ -33,14 +61,39 @@ public BlobCacheMetrics(MeterRegistry meterRegistry) {
3361
"es.blob_cache.cache_miss_load_times.histogram",
3462
"The time in milliseconds for populating entries in the blob store resulting from a cache miss, expressed as a histogram.",
3563
"ms"
64+
),
65+
meterRegistry.registerDoubleHistogram(
66+
"es.blob_cache.population.throughput.histogram",
67+
"The throughput observed when populating the the cache",
68+
"MiB/second"
69+
),
70+
meterRegistry.registerLongCounter(
71+
"es.blob_cache.population.bytes.total",
72+
"The number of bytes that have been copied into the cache",
73+
"bytes"
74+
),
75+
meterRegistry.registerLongCounter(
76+
"es.blob_cache.population.time.total",
77+
"The time spent copying data into the cache",
78+
"milliseconds"
3679
)
3780
);
3881
}
3982

40-
BlobCacheMetrics(LongCounter cacheMissCounter, LongCounter evictedCountNonZeroFrequency, LongHistogram cacheMissLoadTimes) {
83+
BlobCacheMetrics(
84+
LongCounter cacheMissCounter,
85+
LongCounter evictedCountNonZeroFrequency,
86+
LongHistogram cacheMissLoadTimes,
87+
DoubleHistogram cachePopulationThroughput,
88+
LongCounter cachePopulationBytes,
89+
LongCounter cachePopulationTime
90+
) {
4191
this.cacheMissCounter = cacheMissCounter;
4292
this.evictedCountNonZeroFrequency = evictedCountNonZeroFrequency;
4393
this.cacheMissLoadTimes = cacheMissLoadTimes;
94+
this.cachePopulationThroughput = cachePopulationThroughput;
95+
this.cachePopulationBytes = cachePopulationBytes;
96+
this.cachePopulationTime = cachePopulationTime;
4497
}
4598

4699
public static BlobCacheMetrics NOOP = new BlobCacheMetrics(TelemetryProvider.NOOP.getMeterRegistry());
@@ -56,4 +109,55 @@ public LongCounter getEvictedCountNonZeroFrequency() {
56109
public LongHistogram getCacheMissLoadTimes() {
57110
return cacheMissLoadTimes;
58111
}
112+
113+
/**
114+
* Record the various cache population metrics after a chunk is copied to the cache
115+
*
116+
* @param bytesCopied The number of bytes copied
117+
* @param copyTimeNanos The time taken to copy the bytes in nanoseconds
118+
* @param index The index being loaded
119+
* @param shardId The ID of the shard being loaded
120+
* @param cachePopulationReason The reason for the cache being populated
121+
* @param cachePopulationSource The source from which the data is being loaded
122+
*/
123+
public void recordCachePopulationMetrics(
124+
int bytesCopied,
125+
long copyTimeNanos,
126+
String index,
127+
int shardId,
128+
CachePopulationReason cachePopulationReason,
129+
CachePopulationSource cachePopulationSource
130+
) {
131+
Map<String, Object> metricAttributes = Map.of(
132+
INDEX_ATTRIBUTE_KEY,
133+
index,
134+
SHARD_ID_ATTRIBUTE_KEY,
135+
shardId,
136+
CACHE_POPULATION_REASON_ATTRIBUTE_KEY,
137+
cachePopulationReason.name(),
138+
CACHE_POPULATION_SOURCE_ATTRIBUTE_KEY,
139+
cachePopulationSource.name()
140+
);
141+
assert bytesCopied > 0 : "We shouldn't be recording zero-sized copies";
142+
cachePopulationBytes.incrementBy(bytesCopied, metricAttributes);
143+
144+
// This is almost certainly paranoid, but if we had a very fast/small copy with a very coarse nanosecond timer it might happen?
145+
if (copyTimeNanos > 0) {
146+
cachePopulationThroughput.record(toMebibytesPerSecond(bytesCopied, copyTimeNanos), metricAttributes);
147+
cachePopulationTime.incrementBy(TimeUnit.NANOSECONDS.toMillis(copyTimeNanos), metricAttributes);
148+
} else {
149+
logger.warn("Zero-time copy being reported, ignoring");
150+
}
151+
}
152+
153+
/**
154+
* Calculate throughput as MiB/second
155+
*
156+
* @param numberOfBytes The number of bytes transferred
157+
* @param timeInNanoseconds The time taken to transfer in nanoseconds
158+
* @return The throughput as MiB/second
159+
*/
160+
private double toMebibytesPerSecond(int numberOfBytes, long timeInNanoseconds) {
161+
return ((double) numberOfBytes / timeInNanoseconds) * BYTES_PER_NANOSECONDS_TO_MEBIBYTES_PER_SECOND;
162+
}
59163
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
package org.elasticsearch.blobcache;
9+
10+
/**
11+
* The places we populate the cache from
12+
*/
13+
public enum CachePopulationSource {
14+
/**
15+
* When loading data from the blob-store
16+
*/
17+
BlobStore,
18+
/**
19+
* When fetching data from a peer node
20+
*/
21+
Peer,
22+
/**
23+
* We cannot determine the source (should not be used except in exceptional cases)
24+
*/
25+
Unknown
26+
}

x-pack/plugin/blob-cache/src/main/java/org/elasticsearch/blobcache/shared/SharedBlobCacheService.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,10 @@ public static long calculateCacheSize(Settings settings, long totalFsSize) {
398398
.getBytes();
399399
}
400400

401+
public BlobCacheMetrics getBlobCacheMetrics() {
402+
return blobCacheMetrics;
403+
}
404+
401405
public int getRangeSize() {
402406
return rangeSize;
403407
}
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
package org.elasticsearch.blobcache;
9+
10+
import org.elasticsearch.common.unit.ByteSizeValue;
11+
import org.elasticsearch.telemetry.InstrumentType;
12+
import org.elasticsearch.telemetry.Measurement;
13+
import org.elasticsearch.telemetry.RecordingMeterRegistry;
14+
import org.elasticsearch.test.ESTestCase;
15+
import org.junit.Before;
16+
17+
import java.util.concurrent.TimeUnit;
18+
19+
public class BlobCacheMetricsTests extends ESTestCase {
20+
21+
private RecordingMeterRegistry recordingMeterRegistry;
22+
private BlobCacheMetrics metrics;
23+
24+
@Before
25+
public void createMetrics() {
26+
recordingMeterRegistry = new RecordingMeterRegistry();
27+
metrics = new BlobCacheMetrics(recordingMeterRegistry);
28+
}
29+
30+
public void testRecordCachePopulationMetricsRecordsThroughput() {
31+
int mebiBytesSent = randomIntBetween(1, 4);
32+
int secondsTaken = randomIntBetween(1, 5);
33+
String indexName = randomIdentifier();
34+
int shardId = randomIntBetween(0, 10);
35+
BlobCacheMetrics.CachePopulationReason cachePopulationReason = randomFrom(BlobCacheMetrics.CachePopulationReason.values());
36+
CachePopulationSource cachePopulationSource = randomFrom(CachePopulationSource.values());
37+
metrics.recordCachePopulationMetrics(
38+
Math.toIntExact(ByteSizeValue.ofMb(mebiBytesSent).getBytes()),
39+
TimeUnit.SECONDS.toNanos(secondsTaken),
40+
indexName,
41+
shardId,
42+
cachePopulationReason,
43+
cachePopulationSource
44+
);
45+
46+
// throughput histogram
47+
Measurement throughputMeasurement = recordingMeterRegistry.getRecorder()
48+
.getMeasurements(InstrumentType.DOUBLE_HISTOGRAM, "es.blob_cache.population.throughput.histogram")
49+
.get(0);
50+
assertEquals(throughputMeasurement.getDouble(), (double) mebiBytesSent / secondsTaken, 0.0);
51+
assertExpectedAttributesPresent(throughputMeasurement, shardId, indexName, cachePopulationReason, cachePopulationSource);
52+
53+
// bytes counter
54+
Measurement totalBytesMeasurement = recordingMeterRegistry.getRecorder()
55+
.getMeasurements(InstrumentType.LONG_COUNTER, "es.blob_cache.population.bytes.total")
56+
.get(0);
57+
assertEquals(totalBytesMeasurement.getLong(), ByteSizeValue.ofMb(mebiBytesSent).getBytes());
58+
assertExpectedAttributesPresent(totalBytesMeasurement, shardId, indexName, cachePopulationReason, cachePopulationSource);
59+
60+
// time counter
61+
Measurement totalTimeMeasurement = recordingMeterRegistry.getRecorder()
62+
.getMeasurements(InstrumentType.LONG_COUNTER, "es.blob_cache.population.time.total")
63+
.get(0);
64+
assertEquals(totalTimeMeasurement.getLong(), TimeUnit.SECONDS.toMillis(secondsTaken));
65+
assertExpectedAttributesPresent(totalTimeMeasurement, shardId, indexName, cachePopulationReason, cachePopulationSource);
66+
}
67+
68+
private static void assertExpectedAttributesPresent(
69+
Measurement measurement,
70+
int shardId,
71+
String indexName,
72+
BlobCacheMetrics.CachePopulationReason cachePopulationReason,
73+
CachePopulationSource cachePopulationSource
74+
) {
75+
assertEquals(measurement.attributes().get(BlobCacheMetrics.SHARD_ID_ATTRIBUTE_KEY), shardId);
76+
assertEquals(measurement.attributes().get(BlobCacheMetrics.INDEX_ATTRIBUTE_KEY), indexName);
77+
assertEquals(measurement.attributes().get(BlobCacheMetrics.CACHE_POPULATION_REASON_ATTRIBUTE_KEY), cachePopulationReason.name());
78+
assertEquals(measurement.attributes().get(BlobCacheMetrics.CACHE_POPULATION_SOURCE_ATTRIBUTE_KEY), cachePopulationSource.name());
79+
}
80+
}

0 commit comments

Comments
 (0)