Skip to content

Commit aab140f

Browse files
authored
IGNITE-27150 Add raft snapshot metrics (#7269)
1 parent 6e6990c commit aab140f

File tree

28 files changed

+718
-43
lines changed

28 files changed

+718
-43
lines changed

docs/_docs/administrators-guide/metrics/metrics-list.adoc

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,3 +296,21 @@ Transaction metrics.
296296
| LocalUnrebalancedPartitionsCount | The number of partitions that should be moved to this node.
297297
| TotalUnrebalancedPartitionsCount | The total number of partitions that should be moved to a new owner.
298298
|=======================================================================
299+
300+
== raft.snapshots
301+
302+
Metrics related to Raft snapshots of partition replicas.
303+
304+
[width="100%",cols="20%,80%",opts="header"]
305+
|=======================================================================
306+
| Metric name | Description
307+
308+
| IncomingSnapshots | The number of incoming Raft snapshots in progress.
309+
| IncomingSnapshotsLoadingMeta | The number of incoming Raft snapshots loading metadata.
310+
| IncomingSnapshotsWaitingCatalog | The number of incoming Raft snapshots waiting for the catalog.
311+
| IncomingSnapshotsPreparingStorages | The number of incoming Raft snapshots preparing storages.
312+
| IncomingSnapshotsPreparingIndexForBuild | The number of incoming Raft snapshots preparing indexes for building.
313+
| IncomingSnapshotsLoadingMvData | The number of incoming Raft snapshots loading multi-versioned data.
314+
| IncomingSnapshotsLoadingTxMeta | The number of incoming Raft snapshots loading transaction metadata.
315+
| OutgoingSnapshots | The number of outgoing Raft snapshots in progress.
316+
|=======================================================================

modules/cli/src/integrationTest/java/org/apache/ignite/internal/cli/CliIntegrationTest.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@ public abstract class CliIntegrationTest extends ClusterPerClassIntegrationTest
8282
new MetricSource().name("placement-driver").enabled(true),
8383
new MetricSource().name("resource.vacuum").enabled(true),
8484
new MetricSource().name("clock.service").enabled(true),
85-
new MetricSource().name("index.builder").enabled(true)
85+
new MetricSource().name("index.builder").enabled(true),
86+
new MetricSource().name("raft.snapshots").enabled(true)
8687
};
8788

8889
/** Correct ignite jdbc url. */

modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/rebalance/ItRebalanceDistributedTest.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1556,7 +1556,8 @@ private class Node {
15561556
txManager,
15571557
schemaManager,
15581558
dataStorageMgr,
1559-
outgoingSnapshotManager
1559+
outgoingSnapshotManager,
1560+
metricManager
15601561
);
15611562

15621563
tableManager = new TableManager(

modules/partition-replicator/build.gradle

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ dependencies {
5252
testImplementation testFixtures(project(':ignite-core'))
5353
testImplementation testFixtures(project(':ignite-low-watermark'))
5454
testImplementation testFixtures(project(':ignite-metastorage'))
55+
testImplementation testFixtures(project(':ignite-metrics'))
5556
testImplementation testFixtures(project(':ignite-schema'))
5657
testImplementation testFixtures(project(':ignite-storage-api'))
5758
testImplementation testFixtures(project(':ignite-table'))
@@ -60,6 +61,8 @@ dependencies {
6061
testImplementation testFixtures(project(':ignite-raft'))
6162
testImplementation testFixtures(project(':ignite-distribution-zones'))
6263

64+
testImplementation libs.awaitility
65+
6366
integrationTestImplementation testFixtures(project(':ignite-cluster-management'))
6467
integrationTestImplementation testFixtures(project(':ignite-core'))
6568
integrationTestImplementation testFixtures(project(':ignite-configuration'))

modules/partition-replicator/src/integrationTest/java/org/apache/ignite/internal/partition/replicator/fixtures/Node.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -744,7 +744,8 @@ public CompletableFuture<Boolean> invoke(Condition condition, Operation success,
744744
txManager,
745745
schemaManager,
746746
dataStorageMgr,
747-
outgoingSnapshotsManager
747+
outgoingSnapshotsManager,
748+
metricManager
748749
);
749750

750751
resourceVacuumManager = new ResourceVacuumManager(

modules/partition-replicator/src/integrationTest/java/org/apache/ignite/internal/partition/replicator/raft/ItZonePartitionRaftListenerRecoveryTest.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383
import org.apache.ignite.internal.partition.replicator.raft.snapshot.PartitionSnapshotStorageFactory;
8484
import org.apache.ignite.internal.partition.replicator.raft.snapshot.PartitionTxStateAccessImpl;
8585
import org.apache.ignite.internal.partition.replicator.raft.snapshot.ZonePartitionKey;
86+
import org.apache.ignite.internal.partition.replicator.raft.snapshot.metrics.RaftSnapshotsMetricsSource;
8687
import org.apache.ignite.internal.partition.replicator.raft.snapshot.outgoing.OutgoingSnapshotsManager;
8788
import org.apache.ignite.internal.placementdriver.LeasePlacementDriver;
8889
import org.apache.ignite.internal.raft.Loza;
@@ -311,7 +312,8 @@ void setUp(
311312
catalogService,
312313
failureProcessor,
313314
executor,
314-
new LogStorageAccessImpl(replicaManager)
315+
new LogStorageAccessImpl(replicaManager),
316+
new RaftSnapshotsMetricsSource()
315317
);
316318
}
317319

modules/partition-replicator/src/main/java/org/apache/ignite/internal/partition/replicator/PartitionReplicaLifecycleManager.java

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@
127127
import org.apache.ignite.internal.metastorage.WatchListener;
128128
import org.apache.ignite.internal.metastorage.dsl.Condition;
129129
import org.apache.ignite.internal.metastorage.dsl.Operation;
130+
import org.apache.ignite.internal.metrics.MetricManager;
130131
import org.apache.ignite.internal.network.InternalClusterNode;
131132
import org.apache.ignite.internal.network.RecipientLeftException;
132133
import org.apache.ignite.internal.network.TopologyService;
@@ -265,6 +266,8 @@ public class PartitionReplicaLifecycleManager extends
265266

266267
private final ZoneResourcesManager zoneResourcesManager;
267268

269+
private final MetricManager metricManager;
270+
268271
private final ReliableCatalogVersions reliableCatalogVersions;
269272

270273
private final EventListener<CreateZoneEventParameters> onCreateZoneListener = this::onCreateZone;
@@ -295,6 +298,7 @@ public class PartitionReplicaLifecycleManager extends
295298
* @param schemaManager Schema manager.
296299
* @param dataStorageManager Data storage manager.
297300
* @param outgoingSnapshotsManager Outgoing snapshots manager.
301+
* @param metricManager Metric manager.
298302
*/
299303
public PartitionReplicaLifecycleManager(
300304
CatalogService catalogService,
@@ -316,7 +320,8 @@ public PartitionReplicaLifecycleManager(
316320
TxManager txManager,
317321
SchemaManager schemaManager,
318322
DataStorageManager dataStorageManager,
319-
OutgoingSnapshotsManager outgoingSnapshotsManager
323+
OutgoingSnapshotsManager outgoingSnapshotsManager,
324+
MetricManager metricManager
320325
) {
321326
this(
322327
catalogService,
@@ -346,7 +351,8 @@ public PartitionReplicaLifecycleManager(
346351
failureProcessor,
347352
partitionOperationsExecutor,
348353
replicaMgr
349-
)
354+
),
355+
metricManager
350356
);
351357
}
352358

@@ -370,7 +376,8 @@ public PartitionReplicaLifecycleManager(
370376
TxManager txManager,
371377
SchemaManager schemaManager,
372378
DataStorageManager dataStorageManager,
373-
ZoneResourcesManager zoneResourcesManager
379+
ZoneResourcesManager zoneResourcesManager,
380+
MetricManager metricManager
374381
) {
375382
this.catalogService = catalogService;
376383
this.replicaMgr = replicaMgr;
@@ -390,6 +397,7 @@ public PartitionReplicaLifecycleManager(
390397
this.schemaManager = schemaManager;
391398
this.dataStorageManager = dataStorageManager;
392399
this.zoneResourcesManager = zoneResourcesManager;
400+
this.metricManager = metricManager;
393401

394402
rebalanceRetryDelayConfiguration = new SystemDistributedConfigurationPropertyHolder<>(
395403
systemDistributedConfiguration,
@@ -431,6 +439,9 @@ public CompletableFuture<Void> startAsync(ComponentContext componentContext) {
431439

432440
executorInclinedPlacementDriver.listen(PrimaryReplicaEvent.PRIMARY_REPLICA_EXPIRED, onPrimaryReplicaExpiredListener);
433441

442+
metricManager.registerSource(zoneResourcesManager.snapshotsMetricsSource());
443+
metricManager.enable(zoneResourcesManager.snapshotsMetricsSource());
444+
434445
return processZonesAndAssignmentsOnStart;
435446
}
436447

@@ -1657,6 +1668,8 @@ public CompletableFuture<Void> stopAsync(ComponentContext componentContext) {
16571668
}
16581669

16591670
try {
1671+
metricManager.unregisterSource(zoneResourcesManager.snapshotsMetricsSource());
1672+
16601673
IgniteUtils.closeAllManually(zoneResourcesManager);
16611674
} catch (Exception e) {
16621675
return failedFuture(e);

modules/partition-replicator/src/main/java/org/apache/ignite/internal/partition/replicator/ZoneResourcesManager.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import org.apache.ignite.internal.partition.replicator.raft.snapshot.PartitionSnapshotStorage;
3535
import org.apache.ignite.internal.partition.replicator.raft.snapshot.PartitionTxStateAccessImpl;
3636
import org.apache.ignite.internal.partition.replicator.raft.snapshot.ZonePartitionKey;
37+
import org.apache.ignite.internal.partition.replicator.raft.snapshot.metrics.RaftSnapshotsMetricsSource;
3738
import org.apache.ignite.internal.partition.replicator.raft.snapshot.outgoing.OutgoingSnapshotsManager;
3839
import org.apache.ignite.internal.replicator.ReplicaManager;
3940
import org.apache.ignite.internal.replicator.ZonePartitionId;
@@ -70,6 +71,8 @@ public class ZoneResourcesManager implements ManuallyCloseable {
7071

7172
private final ReplicaManager replicaManager;
7273

74+
private final RaftSnapshotsMetricsSource snapshotsMetricsSource = new RaftSnapshotsMetricsSource();
75+
7376
/** Map from zone IDs to their resource holders. */
7477
private final Map<Integer, ZoneResources> resourcesByZoneId = new ConcurrentHashMap<>();
7578

@@ -128,7 +131,8 @@ ZonePartitionResources allocateZonePartitionResources(
128131
catalogService,
129132
failureProcessor,
130133
partitionOperationsExecutor,
131-
new LogStorageAccessImpl(replicaManager)
134+
new LogStorageAccessImpl(replicaManager),
135+
snapshotsMetricsSource
132136
);
133137

134138
var zonePartitionResources = new ZonePartitionResources(
@@ -218,6 +222,10 @@ TxStatePartitionStorage txStatePartitionStorage(int zoneId, int partitionId) {
218222
return resources.txStateStorage.getPartitionStorage(partitionId);
219223
}
220224

225+
RaftSnapshotsMetricsSource snapshotsMetricsSource() {
226+
return snapshotsMetricsSource;
227+
}
228+
221229
private static class ZoneResources {
222230

223231
final TxStateStorage txStateStorage;

modules/partition-replicator/src/main/java/org/apache/ignite/internal/partition/replicator/raft/snapshot/PartitionSnapshotStorage.java

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import org.apache.ignite.internal.network.MessagingService;
3737
import org.apache.ignite.internal.network.TopologyService;
3838
import org.apache.ignite.internal.partition.replicator.raft.snapshot.incoming.IncomingSnapshotCopier;
39+
import org.apache.ignite.internal.partition.replicator.raft.snapshot.metrics.RaftSnapshotsMetricsSource;
3940
import org.apache.ignite.internal.partition.replicator.raft.snapshot.outgoing.OutgoingSnapshotReader;
4041
import org.apache.ignite.internal.partition.replicator.raft.snapshot.outgoing.OutgoingSnapshotsManager;
4142
import org.apache.ignite.internal.raft.RaftGroupConfiguration;
@@ -100,6 +101,8 @@ public class PartitionSnapshotStorage {
100101

101102
private final LogStorageAccess logStorage;
102103

104+
private final RaftSnapshotsMetricsSource snapshotsMetricsSource;
105+
103106
/** Constructor. */
104107
public PartitionSnapshotStorage(
105108
ZonePartitionKey partitionKey,
@@ -109,7 +112,8 @@ public PartitionSnapshotStorage(
109112
CatalogService catalogService,
110113
FailureProcessor failureProcessor,
111114
Executor incomingSnapshotsExecutor,
112-
LogStorageAccess logStorage
115+
LogStorageAccess logStorage,
116+
RaftSnapshotsMetricsSource snapshotsMetricsSource
113117
) {
114118
this(
115119
partitionKey,
@@ -120,7 +124,8 @@ public PartitionSnapshotStorage(
120124
failureProcessor,
121125
incomingSnapshotsExecutor,
122126
DEFAULT_WAIT_FOR_METADATA_CATCHUP_MS,
123-
logStorage
127+
logStorage,
128+
snapshotsMetricsSource
124129
);
125130
}
126131

@@ -134,7 +139,8 @@ public PartitionSnapshotStorage(
134139
FailureProcessor failureProcessor,
135140
Executor incomingSnapshotsExecutor,
136141
long waitForMetadataCatchupMs,
137-
LogStorageAccess logStorage
142+
LogStorageAccess logStorage,
143+
RaftSnapshotsMetricsSource snapshotsMetricsSource
138144
) {
139145
this.partitionKey = partitionKey;
140146
this.topologyService = topologyService;
@@ -145,6 +151,7 @@ public PartitionSnapshotStorage(
145151
this.incomingSnapshotsExecutor = incomingSnapshotsExecutor;
146152
this.waitForMetadataCatchupMs = waitForMetadataCatchupMs;
147153
this.logStorage = logStorage;
154+
this.snapshotsMetricsSource = snapshotsMetricsSource;
148155
}
149156

150157
public ZonePartitionKey partitionKey() {
@@ -231,7 +238,13 @@ public SnapshotCopier startIncomingSnapshot(String uri) {
231238

232239
SnapshotUri snapshotUri = SnapshotUri.fromStringUri(uri);
233240

234-
var copier = new IncomingSnapshotCopier(this, snapshotUri, incomingSnapshotsExecutor, waitForMetadataCatchupMs) {
241+
var copier = new IncomingSnapshotCopier(
242+
this,
243+
snapshotUri,
244+
incomingSnapshotsExecutor,
245+
waitForMetadataCatchupMs,
246+
snapshotsMetricsSource
247+
) {
235248
@Override
236249
public void close() {
237250
try {
@@ -257,7 +270,7 @@ public SnapshotReader startOutgoingSnapshot() {
257270

258271
startSnapshotOperation(snapshotId);
259272

260-
return new OutgoingSnapshotReader(snapshotId, this) {
273+
return new OutgoingSnapshotReader(snapshotId, this, snapshotsMetricsSource) {
261274
@Override
262275
public void close() throws IOException {
263276
try {
@@ -282,6 +295,8 @@ private void completeSnapshotOperation(UUID snapshotId) {
282295
LOG.info("Finishing outgoing snapshot [partitionKey={}, snapshotId={}]", partitionKey, snapshotId);
283296

284297
synchronized (snapshotOperationLock) {
298+
LOG.info("Finishing outgoing snapshot [partitionKey={}, snapshotId={}]", partitionKey, snapshotId);
299+
285300
CompletableFuture<Void> operationFuture = ongoingSnapshotOperations.remove(snapshotId);
286301

287302
assert operationFuture != null :

0 commit comments

Comments
 (0)