Skip to content

Commit b48f699

Browse files
authored
[ML] Add Telemetry for models without adaptive allocations (#129161)
Added min and max allocations as attributes to the telemetry for trained models with adaptive allocations enabled. Added telemetry for models with adaptive allocations disabled or never set.
1 parent c4f7b97 commit b48f699

File tree

5 files changed

+82
-7
lines changed

5 files changed

+82
-7
lines changed

docs/changelog/129161.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 129161
2+
summary: Add Telemetry for models without adaptive allocations
3+
area: Machine Learning
4+
type: enhancement
5+
issues: []

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlMetrics.java

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,25 @@ private void registerMasterNodeMetrics(MeterRegistry meterRegistry) {
272272
() -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsFailedAllocations, isMasterMap)
273273
)
274274
);
275+
metrics.add(
276+
meterRegistry.registerLongGauge(
277+
"es.ml.trained_models.deployment.fixed_allocations.current",
278+
"Sum of current trained model allocations that do not use adaptive allocations (either enabled or disabled)",
279+
"allocations",
280+
() -> new LongWithAttributes(trainedModelAllocationCounts.deploymentsWithFixedAllocations, isMasterMap)
281+
)
282+
);
283+
/*
284+
* Deployments with adaptive allocations enabled are not counted here: AdaptiveAllocationsScalerService already tracks the number of allocations for those.
285+
*/
286+
metrics.add(
287+
meterRegistry.registerLongGauge(
288+
"es.ml.trained_models.deployment.disabled_adaptive_allocations.current",
289+
"Sum of current trained model allocations that have adaptive allocations disabled",
290+
"allocations",
291+
() -> new LongWithAttributes(trainedModelAllocationCounts.deploymentsWithDisabledAdaptiveAllocations, isMasterMap)
292+
)
293+
);
275294
}
276295

277296
@Override
@@ -484,17 +503,28 @@ static TrainedModelAllocationCounts findTrainedModelAllocationCounts(TrainedMode
484503
int trainedModelsTargetAllocations = 0;
485504
int trainedModelsCurrentAllocations = 0;
486505
int trainedModelsFailedAllocations = 0;
506+
int deploymentsWithFixedAllocations = 0;
507+
int deploymentsWithDisabledAdaptiveAllocations = 0;
487508

488509
for (TrainedModelAssignment trainedModelAssignment : metadata.allAssignments().values()) {
489510
trainedModelsTargetAllocations += trainedModelAssignment.totalTargetAllocations();
490-
trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations();
491511
trainedModelsFailedAllocations += trainedModelAssignment.totalFailedAllocations();
512+
trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations();
513+
514+
if (trainedModelAssignment.getAdaptiveAllocationsSettings() == null) {
515+
deploymentsWithFixedAllocations += 1;
516+
} else if ((trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == null)
517+
|| (trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == false)) {
518+
deploymentsWithDisabledAdaptiveAllocations += 1;
519+
}
492520
}
493521

494522
return new TrainedModelAllocationCounts(
495523
trainedModelsTargetAllocations,
496524
trainedModelsCurrentAllocations,
497-
trainedModelsFailedAllocations
525+
trainedModelsFailedAllocations,
526+
deploymentsWithFixedAllocations,
527+
deploymentsWithDisabledAdaptiveAllocations
498528
);
499529
}
500530

@@ -556,8 +586,10 @@ record MlTaskStatusCounts(
556586
/**
 * Aggregated counts computed by findTrainedModelAllocationCounts over all trained model assignments.
 * <ul>
 *   <li>trainedModelsTargetAllocations / trainedModelsCurrentAllocations / trainedModelsFailedAllocations:
 *       sums of the per-assignment target, current and failed allocation totals.</li>
 *   <li>deploymentsWithFixedAllocations: number of assignments (incremented by 1 per assignment, not by
 *       allocation count) whose adaptive allocations settings are null.</li>
 *   <li>deploymentsWithDisabledAdaptiveAllocations: number of assignments whose adaptive allocations
 *       settings are present but whose enabled flag is null or false.</li>
 * </ul>
 * Assignments with adaptive allocations enabled contribute to neither of the two deployment counters.
 */
record TrainedModelAllocationCounts(
557587
int trainedModelsTargetAllocations,
558588
int trainedModelsCurrentAllocations,
559-
int trainedModelsFailedAllocations
589+
int trainedModelsFailedAllocations,
590+
int deploymentsWithFixedAllocations,
591+
int deploymentsWithDisabledAdaptiveAllocations
560592
) {
561-
static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0);
593+
// All-zero instance; one argument per record component.
static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0, 0, 0);
562594
}
563595
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScaler.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,4 +239,12 @@ public Double getLastMeasuredInferenceTime() {
239239
public Long getLastMeasuredQueueSize() {
240240
return lastMeasuredQueueSize;
241241
}
242+
243+
/**
 * Returns the {@code minNumberOfAllocations} field as a boxed {@code Integer}.
 * NOTE(review): the allocation-count metrics observer null-checks this value, so {@code null}
 * presumably means no minimum is configured - confirm.
 */
public Integer getMinNumberOfAllocations() {
244+
return minNumberOfAllocations;
245+
}
246+
247+
/**
 * Returns the {@code maxNumberOfAllocations} field as a boxed {@code Integer}.
 * NOTE(review): presumably {@code null} when no maximum is configured - confirm.
 */
public Integer getMaxNumberOfAllocations() {
248+
return maxNumberOfAllocations;
249+
}
242250
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ void init() {
105105
"es.ml.trained_models.adaptive_allocations.actual_number_of_allocations.current",
106106
"the actual number of allocations",
107107
"",
108-
() -> observeLong(AdaptiveAllocationsScaler::getNumberOfAllocations)
108+
this::observeAllocationCount
109109
)
110110
);
111111
metrics.add(
@@ -179,6 +179,19 @@ Collection<DoubleWithAttributes> observeDouble(Function<AdaptiveAllocationsScale
179179
}
180180
return observations;
181181
}
182+
183+
/**
 * Builds one observation per entry in {@code scalers}: the scaler's current number of allocations,
 * tagged with the attributes {@code deployment_id} and {@code scales_to_zero}.
 *
 * @return a list with one {@code LongWithAttributes} per scaler (empty when there are no scalers)
 */
Collection<LongWithAttributes> observeAllocationCount() {
184+
return scalers.values().stream().map(scaler -> {
185+
var value = scaler.getNumberOfAllocations();
186+
var min = scaler.getMinNumberOfAllocations();
187+
// A deployment is reported as scaling to zero when its minimum is unset (null) or explicitly 0.
var scalesToZero = min == null || min == 0;
188+
189+
return new LongWithAttributes(
190+
value,
191+
// Map.ofEntries rejects null keys/values, so this assumes getDeploymentId() is non-null - TODO confirm.
Map.ofEntries(Map.entry("deployment_id", scaler.getDeploymentId()), Map.entry("scales_to_zero", scalesToZero))
192+
);
193+
}).toList();
194+
}
182195
}
183196

184197
/**

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MlMetricsTests.java

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import org.elasticsearch.xpack.core.ml.datafeed.DatafeedState;
2222
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
2323
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState;
24+
import org.elasticsearch.xpack.core.ml.inference.assignment.AdaptiveAllocationsSettings;
2425
import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingInfo;
2526
import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingState;
2627
import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignment;
@@ -146,11 +147,27 @@ public void testFindTrainedModelAllocationCounts() {
146147
TrainedModelAssignment.Builder.empty(mock(StartTrainedModelDeploymentAction.TaskParams.class), null)
147148
.addRoutingEntry("node2", new RoutingInfo(0, 1, RoutingState.STARTING, ""))
148149
);
150+
metadataBuilder.addNewAssignment(
151+
"model4",
152+
TrainedModelAssignment.Builder.empty(
153+
mock(StartTrainedModelDeploymentAction.TaskParams.class),
154+
new AdaptiveAllocationsSettings(true, 0, 1)
155+
).addRoutingEntry("node1", new RoutingInfo(0, 0, RoutingState.STARTING, ""))
156+
);
157+
metadataBuilder.addNewAssignment(
158+
"model5",
159+
TrainedModelAssignment.Builder.empty(
160+
mock(StartTrainedModelDeploymentAction.TaskParams.class),
161+
new AdaptiveAllocationsSettings(false, 1, 1)
162+
).addRoutingEntry("node1", new RoutingInfo(1, 1, RoutingState.STARTING, ""))
163+
);
149164

150165
MlMetrics.TrainedModelAllocationCounts counts = MlMetrics.findTrainedModelAllocationCounts(metadataBuilder.build());
151-
assertThat(counts.trainedModelsTargetAllocations(), is(5));
152-
assertThat(counts.trainedModelsCurrentAllocations(), is(3));
166+
assertThat(counts.trainedModelsTargetAllocations(), is(6));
167+
assertThat(counts.trainedModelsCurrentAllocations(), is(4));
153168
assertThat(counts.trainedModelsFailedAllocations(), is(1));
169+
assertThat(counts.deploymentsWithFixedAllocations(), is(3));
170+
assertThat(counts.deploymentsWithDisabledAdaptiveAllocations(), is(1));
154171
}
155172

156173
public void testFindNativeMemoryFree() {

0 commit comments

Comments
 (0)