Skip to content

Commit 3ecec85

Browse files
pxsalehismalyshev
authored andcommitted
Use Long instead of Double for allocation disk usage APM metrics (#116732)
I was trying to build a dashboard on top of these metrics and came across some zeros and negative values that I found a bit surprising. Also by mistake some long values are exposed as Double metrics. I've updated the metric test to make sure we have more concrete assertions. (note that the desired balance disk usage metric is double, so I'm keeping it as is).
1 parent 3b2d47b commit 3ecec85

File tree

3 files changed

+36
-18
lines changed

3 files changed

+36
-18
lines changed

server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerMetricsIT.java

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,12 @@
99

1010
package org.elasticsearch.cluster.routing.allocation.allocator;
1111

12+
import org.elasticsearch.cluster.ClusterInfoService;
13+
import org.elasticsearch.cluster.ClusterInfoServiceUtils;
14+
import org.elasticsearch.cluster.InternalClusterInfoService;
1215
import org.elasticsearch.cluster.node.DiscoveryNode;
1316
import org.elasticsearch.common.util.CollectionUtils;
17+
import org.elasticsearch.core.TimeValue;
1418
import org.elasticsearch.plugins.Plugin;
1519
import org.elasticsearch.plugins.PluginsService;
1620
import org.elasticsearch.telemetry.TestTelemetryPlugin;
@@ -56,8 +60,15 @@ public void testDesiredBalanceGaugeMetricsAreOnlyPublishedByCurrentMaster() thro
5660
public void testDesiredBalanceMetrics() {
5761
internalCluster().startNodes(2);
5862
prepareCreate("test").setSettings(indexSettings(2, 1)).get();
59-
indexRandom(randomBoolean(), "test", between(50, 100));
6063
ensureGreen();
64+
65+
indexRandom(randomBoolean(), "test", between(50, 100));
66+
flush("test");
67+
// Make sure new cluster info is available
68+
final var infoService = (InternalClusterInfoService) internalCluster().getCurrentMasterNodeInstance(ClusterInfoService.class);
69+
ClusterInfoServiceUtils.setUpdateFrequency(infoService, TimeValue.timeValueMillis(200));
70+
assertNotNull("info should not be null", ClusterInfoServiceUtils.refresh(infoService));
71+
6172
final var telemetryPlugin = getTelemetryPlugin(internalCluster().getMasterName());
6273
telemetryPlugin.collect();
6374
assertThat(telemetryPlugin.getLongGaugeMeasurement(DesiredBalanceMetrics.UNASSIGNED_SHARDS_METRIC_NAME), not(empty()));
@@ -73,7 +84,7 @@ public void testDesiredBalanceMetrics() {
7384
);
7485
assertThat(desiredBalanceNodeWeightsMetrics.size(), equalTo(2));
7586
for (var nodeStat : desiredBalanceNodeWeightsMetrics) {
76-
assertThat(nodeStat.value().doubleValue(), greaterThanOrEqualTo(0.0));
87+
assertTrue(nodeStat.isDouble());
7788
assertThat((String) nodeStat.attributes().get("node_id"), is(in(nodeIds)));
7889
assertThat((String) nodeStat.attributes().get("node_name"), is(in(nodeNames)));
7990
}
@@ -122,15 +133,16 @@ public void testDesiredBalanceMetrics() {
122133
assertThat((String) nodeStat.attributes().get("node_id"), is(in(nodeIds)));
123134
assertThat((String) nodeStat.attributes().get("node_name"), is(in(nodeNames)));
124135
}
125-
final var currentNodeDiskUsageMetrics = telemetryPlugin.getDoubleGaugeMeasurement(
136+
final var currentNodeDiskUsageMetrics = telemetryPlugin.getLongGaugeMeasurement(
126137
DesiredBalanceMetrics.CURRENT_NODE_DISK_USAGE_METRIC_NAME
127138
);
128139
assertThat(currentNodeDiskUsageMetrics.size(), equalTo(2));
129140
for (var nodeStat : currentNodeDiskUsageMetrics) {
130-
assertThat(nodeStat.value().doubleValue(), greaterThanOrEqualTo(0.0));
141+
assertThat(nodeStat.value().longValue(), greaterThanOrEqualTo(0L));
131142
assertThat((String) nodeStat.attributes().get("node_id"), is(in(nodeIds)));
132143
assertThat((String) nodeStat.attributes().get("node_name"), is(in(nodeNames)));
133144
}
145+
assertTrue(currentNodeDiskUsageMetrics.stream().anyMatch(m -> m.getLong() > 0L));
134146
final var currentNodeUndesiredShardCountMetrics = telemetryPlugin.getLongGaugeMeasurement(
135147
DesiredBalanceMetrics.CURRENT_NODE_UNDESIRED_SHARD_COUNT_METRIC_NAME
136148
);
@@ -140,15 +152,16 @@ public void testDesiredBalanceMetrics() {
140152
assertThat((String) nodeStat.attributes().get("node_id"), is(in(nodeIds)));
141153
assertThat((String) nodeStat.attributes().get("node_name"), is(in(nodeNames)));
142154
}
143-
final var currentNodeForecastedDiskUsageMetrics = telemetryPlugin.getDoubleGaugeMeasurement(
155+
final var currentNodeForecastedDiskUsageMetrics = telemetryPlugin.getLongGaugeMeasurement(
144156
DesiredBalanceMetrics.CURRENT_NODE_FORECASTED_DISK_USAGE_METRIC_NAME
145157
);
146158
assertThat(currentNodeForecastedDiskUsageMetrics.size(), equalTo(2));
147159
for (var nodeStat : currentNodeForecastedDiskUsageMetrics) {
148-
assertThat(nodeStat.value().doubleValue(), greaterThanOrEqualTo(0.0));
160+
assertThat(nodeStat.value().longValue(), greaterThanOrEqualTo(0L));
149161
assertThat((String) nodeStat.attributes().get("node_id"), is(in(nodeIds)));
150162
assertThat((String) nodeStat.attributes().get("node_name"), is(in(nodeNames)));
151163
}
164+
assertTrue(currentNodeForecastedDiskUsageMetrics.stream().anyMatch(m -> m.getLong() > 0L));
152165
}
153166

154167
private static void assertOnlyMasterIsPublishingMetrics() {
@@ -182,10 +195,10 @@ private static void assertMetricsAreBeingPublished(String nodeName, boolean shou
182195
matcher
183196
);
184197
assertThat(testTelemetryPlugin.getDoubleGaugeMeasurement(DesiredBalanceMetrics.CURRENT_NODE_WRITE_LOAD_METRIC_NAME), matcher);
185-
assertThat(testTelemetryPlugin.getDoubleGaugeMeasurement(DesiredBalanceMetrics.CURRENT_NODE_DISK_USAGE_METRIC_NAME), matcher);
198+
assertThat(testTelemetryPlugin.getLongGaugeMeasurement(DesiredBalanceMetrics.CURRENT_NODE_DISK_USAGE_METRIC_NAME), matcher);
186199
assertThat(testTelemetryPlugin.getLongGaugeMeasurement(DesiredBalanceMetrics.CURRENT_NODE_SHARD_COUNT_METRIC_NAME), matcher);
187200
assertThat(
188-
testTelemetryPlugin.getDoubleGaugeMeasurement(DesiredBalanceMetrics.CURRENT_NODE_FORECASTED_DISK_USAGE_METRIC_NAME),
201+
testTelemetryPlugin.getLongGaugeMeasurement(DesiredBalanceMetrics.CURRENT_NODE_FORECASTED_DISK_USAGE_METRIC_NAME),
189202
matcher
190203
);
191204
assertThat(

server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceMetrics.java

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ public DesiredBalanceMetrics(MeterRegistry meterRegistry) {
136136
"threads",
137137
this::getCurrentNodeWriteLoadMetrics
138138
);
139-
meterRegistry.registerDoublesGauge(
139+
meterRegistry.registerLongsGauge(
140140
CURRENT_NODE_DISK_USAGE_METRIC_NAME,
141141
"The current disk usage of nodes",
142142
"bytes",
@@ -148,7 +148,7 @@ public DesiredBalanceMetrics(MeterRegistry meterRegistry) {
148148
"unit",
149149
this::getCurrentNodeShardCountMetrics
150150
);
151-
meterRegistry.registerDoublesGauge(
151+
meterRegistry.registerLongsGauge(
152152
CURRENT_NODE_FORECASTED_DISK_USAGE_METRIC_NAME,
153153
"The current forecasted disk usage of nodes",
154154
"bytes",
@@ -231,16 +231,16 @@ private List<LongWithAttributes> getDesiredBalanceNodeShardCountMetrics() {
231231
return values;
232232
}
233233

234-
private List<DoubleWithAttributes> getCurrentNodeDiskUsageMetrics() {
234+
private List<LongWithAttributes> getCurrentNodeDiskUsageMetrics() {
235235
if (nodeIsMaster == false) {
236236
return List.of();
237237
}
238238
var stats = allocationStatsPerNodeRef.get();
239-
List<DoubleWithAttributes> doubles = new ArrayList<>(stats.size());
239+
List<LongWithAttributes> values = new ArrayList<>(stats.size());
240240
for (var node : stats.keySet()) {
241-
doubles.add(new DoubleWithAttributes(stats.get(node).currentDiskUsage(), getNodeAttributes(node)));
241+
values.add(new LongWithAttributes(stats.get(node).currentDiskUsage(), getNodeAttributes(node)));
242242
}
243-
return doubles;
243+
return values;
244244
}
245245

246246
private List<DoubleWithAttributes> getCurrentNodeWriteLoadMetrics() {
@@ -267,16 +267,16 @@ private List<LongWithAttributes> getCurrentNodeShardCountMetrics() {
267267
return values;
268268
}
269269

270-
private List<DoubleWithAttributes> getCurrentNodeForecastedDiskUsageMetrics() {
270+
private List<LongWithAttributes> getCurrentNodeForecastedDiskUsageMetrics() {
271271
if (nodeIsMaster == false) {
272272
return List.of();
273273
}
274274
var stats = allocationStatsPerNodeRef.get();
275-
List<DoubleWithAttributes> doubles = new ArrayList<>(stats.size());
275+
List<LongWithAttributes> values = new ArrayList<>(stats.size());
276276
for (var node : stats.keySet()) {
277-
doubles.add(new DoubleWithAttributes(stats.get(node).forecastedDiskUsage(), getNodeAttributes(node)));
277+
values.add(new LongWithAttributes(stats.get(node).forecastedDiskUsage(), getNodeAttributes(node)));
278278
}
279-
return doubles;
279+
return values;
280280
}
281281

282282
private List<LongWithAttributes> getCurrentNodeUndesiredShardCountMetrics() {

test/framework/src/main/java/org/elasticsearch/cluster/ClusterInfoServiceUtils.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import org.apache.logging.log4j.Logger;
1414
import org.elasticsearch.action.support.PlainActionFuture;
1515
import org.elasticsearch.cluster.service.ClusterApplierService;
16+
import org.elasticsearch.core.TimeValue;
1617

1718
import java.util.concurrent.TimeUnit;
1819

@@ -37,4 +38,8 @@ protected boolean blockingAllowed() {
3738
throw new AssertionError(e);
3839
}
3940
}
41+
42+
public static void setUpdateFrequency(InternalClusterInfoService internalClusterInfoService, TimeValue updateFrequency) {
43+
internalClusterInfoService.setUpdateFrequency(updateFrequency);
44+
}
4045
}

0 commit comments

Comments
 (0)