Skip to content

Commit b341645

Browse files
committed
Add snapshot metrics
1 parent 345cc59 commit b341645

File tree

14 files changed

+136
-63
lines changed

14 files changed

+136
-63
lines changed

server/src/internalClusterTest/java/org/elasticsearch/repositories/SnapshotMetricsIT.java

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import org.elasticsearch.action.admin.indices.stats.IndexStats;
1313
import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse;
1414
import org.elasticsearch.action.admin.indices.stats.ShardStats;
15+
import org.elasticsearch.common.settings.Settings;
1516
import org.elasticsearch.common.unit.ByteSizeValue;
1617
import org.elasticsearch.common.util.CollectionUtils;
1718
import org.elasticsearch.core.TimeValue;
@@ -20,6 +21,7 @@
2021
import org.elasticsearch.snapshots.AbstractSnapshotIntegTestCase;
2122
import org.elasticsearch.telemetry.Measurement;
2223
import org.elasticsearch.telemetry.TestTelemetryPlugin;
24+
import org.elasticsearch.test.ESIntegTestCase;
2325
import org.elasticsearch.threadpool.ThreadPool;
2426
import org.hamcrest.Matcher;
2527

@@ -28,6 +30,7 @@
2830
import java.util.stream.Stream;
2931
import java.util.stream.StreamSupport;
3032

33+
import static org.elasticsearch.threadpool.ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING;
3134
import static org.hamcrest.Matchers.allOf;
3235
import static org.hamcrest.Matchers.equalTo;
3336
import static org.hamcrest.Matchers.everyItem;
@@ -36,13 +39,23 @@
3639
import static org.hamcrest.Matchers.hasSize;
3740
import static org.hamcrest.Matchers.lessThan;
3841

42+
@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST)
3943
public class SnapshotMetricsIT extends AbstractSnapshotIntegTestCase {
4044

4145
@Override
4246
protected Collection<Class<? extends Plugin>> nodePlugins() {
4347
return CollectionUtils.appendToCopy(super.nodePlugins(), TestTelemetryPlugin.class);
4448
}
4549

50+
@Override
51+
protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
52+
return Settings.builder()
53+
.put(super.nodeSettings(nodeOrdinal, otherSettings))
54+
// Make sanity checking duration histograms possible
55+
.put(ESTIMATED_TIME_INTERVAL_SETTING.getKey(), "0s")
56+
.build();
57+
}
58+
4659
public void testSnapshotAPMMetrics() throws Exception {
4760
final String indexName = randomIdentifier();
4861
final int numShards = randomIntBetween(1, 10);
@@ -61,8 +74,8 @@ public void testSnapshotAPMMetrics() throws Exception {
6174

6275
final String repositoryName = randomIdentifier();
6376

64-
// we want to ensure some throttling, but not too much that it slows the test down. 5 seemed a reasonable multiple to ensure that.
65-
int shardSizeMultipleToEnsureThrottling = 5;
77+
// we want to ensure some throttling, but not too much that it slows the test down. 3 seemed a reasonable multiple to ensure that.
78+
int shardSizeMultipleToEnsureThrottling = 3;
6679
createRepository(
6780
repositoryName,
6881
"mock",
@@ -84,9 +97,10 @@ public void testSnapshotAPMMetrics() throws Exception {
8497

8598
waitForBlockOnAnyDataNode(repositoryName);
8699
collectMetrics();
100+
assertSnapshotsInProgressMetricIs(greaterThan(0L));
87101
assertShardsInProgressMetricIs(hasItem(greaterThan(0L)));
88-
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOT_STARTED), equalTo(1L));
89-
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOT_COMPLETED), equalTo(0L));
102+
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOTS_STARTED), equalTo(1L));
103+
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOTS_COMPLETED), equalTo(0L));
90104
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOT_SHARDS_STARTED), greaterThan(0L));
91105
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOT_SHARDS_COMPLETED), equalTo(0L));
92106
} finally {
@@ -103,8 +117,8 @@ public void testSnapshotAPMMetrics() throws Exception {
103117
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOT_BYTES_UPLOADED), greaterThan(0L));
104118
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOT_CREATE_THROTTLE_DURATION), greaterThan(0L));
105119
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOT_RESTORE_THROTTLE_DURATION), equalTo(0L));
106-
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOT_STARTED), equalTo(1L));
107-
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOT_COMPLETED), equalTo(1L));
120+
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOTS_STARTED), equalTo(1L));
121+
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOTS_COMPLETED), equalTo(1L));
108122

109123
// Sanity check shard duration observations
110124
assertDoubleHistogramMetrics(SnapshotMetrics.SNAPSHOT_SHARDS_DURATION, hasSize(numShards));
@@ -140,6 +154,7 @@ public void testSnapshotAPMMetrics() throws Exception {
140154
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOT_SHARDS_STARTED), equalTo((long) numShards));
141155
assertThat(getTotalClusterLongCounterValue(SnapshotMetrics.SNAPSHOT_SHARDS_COMPLETED), equalTo((long) numShards));
142156

157+
assertSnapshotsInProgressMetricIs(equalTo(0L));
143158
assertShardsInProgressMetricIs(everyItem(equalTo(0L)));
144159

145160
// Restore the snapshot
@@ -173,6 +188,20 @@ private static void assertShardsInProgressMetricIs(Matcher<? super List<Long>> m
173188
assertThat(values, matcher);
174189
}
175190

191+
private static void assertSnapshotsInProgressMetricIs(Matcher<Long> matcher) {
192+
final List<Long> values = internalCluster().getCurrentMasterNodeInstance(PluginsService.class)
193+
.filterPlugins(TestTelemetryPlugin.class)
194+
.map(testTelemetryPlugin -> {
195+
final List<Measurement> longGaugeMeasurement = testTelemetryPlugin.getLongGaugeMeasurement(
196+
SnapshotMetrics.SNAPSHOTS_IN_PROGRESS
197+
);
198+
return longGaugeMeasurement.getLast().getLong();
199+
})
200+
.toList();
201+
assertThat(values, hasSize(1));
202+
assertThat(values.getFirst(), matcher);
203+
}
204+
176205
private static void collectMetrics() {
177206
allTestTelemetryPlugins().forEach(TestTelemetryPlugin::collect);
178207
}

server/src/main/java/org/elasticsearch/node/NodeConstruction.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@
188188
import org.elasticsearch.readiness.ReadinessService;
189189
import org.elasticsearch.repositories.RepositoriesModule;
190190
import org.elasticsearch.repositories.RepositoriesService;
191+
import org.elasticsearch.repositories.SnapshotMetrics;
191192
import org.elasticsearch.reservedstate.ReservedClusterStateHandler;
192193
import org.elasticsearch.reservedstate.ReservedProjectStateHandler;
193194
import org.elasticsearch.reservedstate.ReservedStateHandlerProvider;
@@ -728,6 +729,7 @@ private void construct(
728729
BigArrays bigArrays = serviceProvider.newBigArrays(pluginsService, pageCacheRecycler, circuitBreakerService);
729730

730731
final RecoverySettings recoverySettings = new RecoverySettings(settings, settingsModule.getClusterSettings());
732+
SnapshotMetrics snapshotMetrics = new SnapshotMetrics(telemetryProvider.getMeterRegistry());
731733
RepositoriesModule repositoriesModule = new RepositoriesModule(
732734
environment,
733735
pluginsService.filterPlugins(RepositoryPlugin.class).toList(),
@@ -737,7 +739,8 @@ private void construct(
737739
bigArrays,
738740
xContentRegistry,
739741
recoverySettings,
740-
telemetryProvider
742+
telemetryProvider,
743+
snapshotMetrics
741744
);
742745
RepositoriesService repositoriesService = repositoriesModule.getRepositoryService();
743746
final SetOnce<RerouteService> rerouteServiceReference = new SetOnce<>();
@@ -1115,7 +1118,8 @@ public Map<String, String> queryFields() {
11151118
repositoriesService,
11161119
transportService,
11171120
actionModule.getActionFilters(),
1118-
systemIndices
1121+
systemIndices,
1122+
snapshotMetrics
11191123
);
11201124
SnapshotShardsService snapshotShardsService = new SnapshotShardsService(
11211125
settings,

server/src/main/java/org/elasticsearch/repositories/RepositoriesModule.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ public RepositoriesModule(
5050
BigArrays bigArrays,
5151
NamedXContentRegistry namedXContentRegistry,
5252
RecoverySettings recoverySettings,
53-
TelemetryProvider telemetryProvider
53+
TelemetryProvider telemetryProvider,
54+
SnapshotMetrics snapshotMetrics
5455
) {
5556
final RepositoriesMetrics repositoriesMetrics = new RepositoriesMetrics(telemetryProvider.getMeterRegistry());
5657
Map<String, Repository.Factory> factories = new HashMap<>();
@@ -142,7 +143,7 @@ public Repository create(ProjectId projectId, RepositoryMetadata metadata, Snaps
142143
threadPool,
143144
client,
144145
preRestoreChecks,
145-
telemetryProvider.getMeterRegistry()
146+
snapshotMetrics
146147
);
147148
}
148149

server/src/main/java/org/elasticsearch/repositories/RepositoriesService.java

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@
5959
import org.elasticsearch.repositories.blobstore.MeteredBlobStoreRepository;
6060
import org.elasticsearch.snapshots.Snapshot;
6161
import org.elasticsearch.telemetry.metric.LongWithAttributes;
62-
import org.elasticsearch.telemetry.metric.MeterRegistry;
6362
import org.elasticsearch.threadpool.ThreadPool;
6463

6564
import java.io.IOException;
@@ -136,7 +135,7 @@ public RepositoriesService(
136135
ThreadPool threadPool,
137136
NodeClient client,
138137
List<BiConsumer<Snapshot, IndexVersion>> preRestoreChecks,
139-
MeterRegistry meterRegistry
138+
SnapshotMetrics snapshotMetrics
140139
) {
141140
this.typesRegistry = typesRegistry;
142141
this.internalTypesRegistry = internalTypesRegistry;
@@ -156,7 +155,8 @@ public RepositoriesService(
156155
threadPool.relativeTimeInMillisSupplier()
157156
);
158157
this.preRestoreChecks = preRestoreChecks;
159-
this.snapshotMetrics = new SnapshotMetrics(meterRegistry, this::getShardSnapshotsInProgress);
158+
this.snapshotMetrics = snapshotMetrics;
159+
snapshotMetrics.createSnapshotShardsInProgressMetric(this::getShardSnapshotsInProgress);
160160
}
161161

162162
/**
@@ -1269,10 +1269,6 @@ public RepositoryUsageStats getUsageStats() {
12691269
);
12701270
}
12711271

1272-
public SnapshotMetrics getSnapshotMetrics() {
1273-
return snapshotMetrics;
1274-
}
1275-
12761272
@Override
12771273
protected void doStart() {}
12781274

server/src/main/java/org/elasticsearch/repositories/SnapshotMetrics.java

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,14 @@
99

1010
package org.elasticsearch.repositories;
1111

12+
import org.elasticsearch.cluster.metadata.ProjectId;
1213
import org.elasticsearch.cluster.metadata.RepositoryMetadata;
1314
import org.elasticsearch.telemetry.metric.DoubleHistogram;
1415
import org.elasticsearch.telemetry.metric.LongCounter;
15-
import org.elasticsearch.telemetry.metric.LongGauge;
1616
import org.elasticsearch.telemetry.metric.LongWithAttributes;
1717
import org.elasticsearch.telemetry.metric.MeterRegistry;
1818

1919
import java.util.Collection;
20-
import java.util.List;
2120
import java.util.Map;
2221
import java.util.function.Supplier;
2322

@@ -27,20 +26,21 @@ public record SnapshotMetrics(
2726
DoubleHistogram snapshotsDurationHistogram,
2827
LongCounter snapshotsShardsStartedCounter,
2928
LongCounter snapshotsShardsCompletedCounter,
30-
LongGauge snapshotShardsInProgressGauge,
3129
DoubleHistogram snapshotShardsDurationHistogram,
3230
LongCounter snapshotBlobsUploadedCounter,
3331
LongCounter snapshotBytesUploadedCounter,
3432
LongCounter snapshotUploadDurationCounter,
3533
LongCounter snapshotUploadReadDurationCounter,
3634
LongCounter snapshotCreateThrottleDurationCounter,
37-
LongCounter snapshotRestoreThrottleDurationCounter
35+
LongCounter snapshotRestoreThrottleDurationCounter,
36+
MeterRegistry meterRegistry
3837
) {
3938

40-
public static final SnapshotMetrics NOOP = new SnapshotMetrics(MeterRegistry.NOOP, List::of);
39+
public static final SnapshotMetrics NOOP = new SnapshotMetrics(MeterRegistry.NOOP);
4140

42-
public static final String SNAPSHOT_STARTED = "es.repositories.snapshots.started.total";
43-
public static final String SNAPSHOT_COMPLETED = "es.repositories.snapshots.completed.total";
41+
public static final String SNAPSHOTS_STARTED = "es.repositories.snapshots.started.total";
42+
public static final String SNAPSHOTS_COMPLETED = "es.repositories.snapshots.completed.total";
43+
public static final String SNAPSHOTS_IN_PROGRESS = "es.repositories.snapshots.current";
4444
public static final String SNAPSHOT_DURATION = "es.repositories.snapshots.duration.histogram";
4545
public static final String SNAPSHOT_SHARDS_STARTED = "es.repositories.snapshots.shards.started.total";
4646
public static final String SNAPSHOT_SHARDS_COMPLETED = "es.repositories.snapshots.shards.completed.total";
@@ -53,30 +53,38 @@ public record SnapshotMetrics(
5353
public static final String SNAPSHOT_CREATE_THROTTLE_DURATION = "es.repositories.snapshots.create_throttling.time.total";
5454
public static final String SNAPSHOT_RESTORE_THROTTLE_DURATION = "es.repositories.snapshots.restore_throttling.time.total";
5555

56-
public SnapshotMetrics(MeterRegistry meterRegistry, Supplier<Collection<LongWithAttributes>> shardSnapshotsInProgressObserver) {
56+
public SnapshotMetrics(MeterRegistry meterRegistry) {
5757
this(
58-
meterRegistry.registerLongCounter(SNAPSHOT_STARTED, "snapshots started", "unit"),
59-
meterRegistry.registerLongCounter(SNAPSHOT_COMPLETED, "snapshots completed", "unit"),
58+
meterRegistry.registerLongCounter(SNAPSHOTS_STARTED, "snapshots started", "unit"),
59+
meterRegistry.registerLongCounter(SNAPSHOTS_COMPLETED, "snapshots completed", "unit"),
6060
meterRegistry.registerDoubleHistogram(SNAPSHOT_DURATION, "snapshots duration", "s"),
6161
meterRegistry.registerLongCounter(SNAPSHOT_SHARDS_STARTED, "shard snapshots started", "unit"),
6262
meterRegistry.registerLongCounter(SNAPSHOT_SHARDS_COMPLETED, "shard snapshots completed", "unit"),
63-
meterRegistry.registerLongsGauge(
64-
SNAPSHOT_SHARDS_IN_PROGRESS,
65-
"shard snapshots in progress",
66-
"unit",
67-
shardSnapshotsInProgressObserver
68-
),
6963
meterRegistry.registerDoubleHistogram(SNAPSHOT_SHARDS_DURATION, "shard snapshots duration", "s"),
7064
meterRegistry.registerLongCounter(SNAPSHOT_BLOBS_UPLOADED, "snapshot blobs uploaded", "unit"),
7165
meterRegistry.registerLongCounter(SNAPSHOT_BYTES_UPLOADED, "snapshot bytes uploaded", "bytes"),
7266
meterRegistry.registerLongCounter(SNAPSHOT_UPLOAD_DURATION, "snapshot upload duration", "ns"),
7367
meterRegistry.registerLongCounter(SNAPSHOT_UPLOAD_READ_DURATION, "time spent in read() calls when snapshotting", "ns"),
7468
meterRegistry.registerLongCounter(SNAPSHOT_CREATE_THROTTLE_DURATION, "time throttled in snapshot create", "bytes"),
75-
meterRegistry.registerLongCounter(SNAPSHOT_RESTORE_THROTTLE_DURATION, "time throttled in snapshot restore", "bytes")
69+
meterRegistry.registerLongCounter(SNAPSHOT_RESTORE_THROTTLE_DURATION, "time throttled in snapshot restore", "bytes"),
70+
meterRegistry
7671
);
7772
}
7873

79-
public static Map<String, Object> createAttributesMap(RepositoryMetadata meta) {
80-
return Map.of("repo_type", meta.type(), "repo_name", meta.name());
74+
public void createSnapshotShardsInProgressMetric(Supplier<Collection<LongWithAttributes>> shardSnapshotsInProgressObserver) {
75+
meterRegistry.registerLongsGauge(
76+
SNAPSHOT_SHARDS_IN_PROGRESS,
77+
"shard snapshots in progress",
78+
"unit",
79+
shardSnapshotsInProgressObserver
80+
);
81+
}
82+
83+
public void createSnapshotsInProgressMetric(Supplier<Collection<LongWithAttributes>> snapshotsInProgressObserver) {
84+
meterRegistry.registerLongsGauge(SNAPSHOTS_IN_PROGRESS, "snapshots in progress", "unit", snapshotsInProgressObserver);
85+
}
86+
87+
public static Map<String, Object> createAttributesMap(ProjectId projectId, RepositoryMetadata meta) {
88+
return Map.of("project_id", projectId.id(), "repo_type", meta.type(), "repo_name", meta.name());
8189
}
8290
}

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -528,7 +528,7 @@ protected BlobStoreRepository(
528528
threadPool.info(ThreadPool.Names.SNAPSHOT).getMax(),
529529
threadPool.executor(ThreadPool.Names.SNAPSHOT)
530530
);
531-
this.blobStoreSnapshotMetrics = new BlobStoreSnapshotMetrics(metadata, snapshotMetrics);
531+
this.blobStoreSnapshotMetrics = new BlobStoreSnapshotMetrics(projectId, metadata, snapshotMetrics);
532532
}
533533

534534
@Override
@@ -3215,7 +3215,7 @@ public void snapshotShard(SnapshotShardContext context) {
32153215

32163216
private void doSnapshotShard(SnapshotShardContext context) {
32173217
blobStoreSnapshotMetrics.shardSnapshotStarted();
3218-
context.addListener(ActionListener.running(() -> blobStoreSnapshotMetrics.shardSnapshotCompleted(context.status().getTotalTime())));
3218+
context.addListener(ActionListener.running(() -> blobStoreSnapshotMetrics.shardSnapshotCompleted(context.status())));
32193219
if (isReadOnly()) {
32203220
context.onFailure(new RepositoryException(metadata.name(), "cannot snapshot shard on a readonly repository"));
32213221
return;

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreSnapshotMetrics.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@
99

1010
package org.elasticsearch.repositories.blobstore;
1111

12+
import org.elasticsearch.cluster.metadata.ProjectId;
1213
import org.elasticsearch.cluster.metadata.RepositoryMetadata;
1314
import org.elasticsearch.common.metrics.CounterMetric;
15+
import org.elasticsearch.index.snapshots.IndexShardSnapshotStatus;
1416
import org.elasticsearch.repositories.SnapshotMetrics;
1517
import org.elasticsearch.telemetry.metric.LongWithAttributes;
1618

@@ -30,9 +32,9 @@ public class BlobStoreSnapshotMetrics {
3032
private final CounterMetric numberOfShardSnapshotsCompleted = new CounterMetric();
3133
private final Map<String, Object> metricAttributes;
3234

33-
public BlobStoreSnapshotMetrics(RepositoryMetadata repositoryMetadata, SnapshotMetrics snapshotMetrics) {
35+
public BlobStoreSnapshotMetrics(ProjectId projectId, RepositoryMetadata repositoryMetadata, SnapshotMetrics snapshotMetrics) {
3436
this.snapshotMetrics = snapshotMetrics;
35-
metricAttributes = SnapshotMetrics.createAttributesMap(repositoryMetadata);
37+
metricAttributes = SnapshotMetrics.createAttributesMap(projectId, repositoryMetadata);
3638
}
3739

3840
public void incrementSnapshotRateLimitingTimeInNanos(long throttleTimeNanos) {
@@ -71,10 +73,10 @@ public void shardSnapshotStarted() {
7173
shardSnapshotsInProgress.inc();
7274
}
7375

74-
public void shardSnapshotCompleted(long durationInMillis) {
76+
public void shardSnapshotCompleted(IndexShardSnapshotStatus status) {
7577
snapshotMetrics.snapshotsShardsCompletedCounter().increment();
76-
if (durationInMillis > 0) {
77-
snapshotMetrics.snapshotShardsDurationHistogram().record(durationInMillis / 1_000f);
78+
if (status.getStage() == IndexShardSnapshotStatus.Stage.DONE) {
79+
snapshotMetrics.snapshotShardsDurationHistogram().record(status.getTotalTime() / 1_000f);
7880
}
7981
numberOfShardSnapshotsCompleted.inc();
8082
shardSnapshotsInProgress.dec();

0 commit comments

Comments
 (0)