From 398a37e6fcf6b5622af747d2b63889afcebbb6a1 Mon Sep 17 00:00:00 2001 From: Pat Whelan Date: Fri, 6 Jun 2025 17:48:26 -0400 Subject: [PATCH 1/4] [ML] Add Telemetry for models without adaptive allocations Added min and max allocations as attributes to the telemetry for trained models with adaptive allocations enabled. Added telemetry for models with adaptive allocations disabled or never set. --- .../org/elasticsearch/xpack/ml/MlMetrics.java | 40 +++++++++++++++++-- .../AdaptiveAllocationsScaler.java | 8 ++++ .../AdaptiveAllocationsScalerService.java | 19 ++++++++- .../xpack/ml/MlMetricsTests.java | 21 +++++++++- 4 files changed, 81 insertions(+), 7 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlMetrics.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlMetrics.java index 65d24a13564bc..98ff3ec9944d3 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlMetrics.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlMetrics.java @@ -272,6 +272,25 @@ private void registerMasterNodeMetrics(MeterRegistry meterRegistry) { () -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsFailedAllocations, isMasterMap) ) ); + metrics.add( + meterRegistry.registerLongGauge( + "es.ml.trained_models.deployment.fixed_allocations.current", + "Sum of current trained model allocations that do not use adaptive allocations (either enabled or disabled)", + "allocations", + () -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsFixedAllocations, isMasterMap) + ) + ); + /* + * AdaptiveAllocationsScalerService tracks the number of allocations with adaptive allocations enabled. + */ + metrics.add( + meterRegistry.registerLongGauge( + "es.ml.trained_models.deployment.disabled_adaptive_allocations.current", + "Sum of current trained model allocations that have adaptive allocations disabled", + "allocations", + () -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsDisabledAdaptiveAllocations, isMasterMap) + ) + ); } @Override @@ -484,17 +503,28 @@ static TrainedModelAllocationCounts findTrainedModelAllocationCounts(TrainedMode int trainedModelsTargetAllocations = 0; int trainedModelsCurrentAllocations = 0; int trainedModelsFailedAllocations = 0; + int trainedModelsFixedAllocations = 0; + int trainedModelsDisabledAdaptiveAllocations = 0; for (TrainedModelAssignment trainedModelAssignment : metadata.allAssignments().values()) { trainedModelsTargetAllocations += trainedModelAssignment.totalTargetAllocations(); - trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations(); trainedModelsFailedAllocations += trainedModelAssignment.totalFailedAllocations(); + + trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations(); + if (trainedModelAssignment.getAdaptiveAllocationsSettings() == null) { + trainedModelsFixedAllocations += trainedModelAssignment.totalCurrentAllocations(); + } else if ((trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == null) + || (trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == false)) { + trainedModelsDisabledAdaptiveAllocations += trainedModelAssignment.totalCurrentAllocations(); + } } return new TrainedModelAllocationCounts( trainedModelsTargetAllocations, trainedModelsCurrentAllocations, - trainedModelsFailedAllocations + trainedModelsFailedAllocations, + trainedModelsFixedAllocations, + trainedModelsDisabledAdaptiveAllocations ); } @@ -556,8 +586,10 @@ record MlTaskStatusCounts( record TrainedModelAllocationCounts( int trainedModelsTargetAllocations, int trainedModelsCurrentAllocations, - int trainedModelsFailedAllocations + int trainedModelsFailedAllocations, + int trainedModelsFixedAllocations, + int trainedModelsDisabledAdaptiveAllocations ) { - static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0); + static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0, 0, 0); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScaler.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScaler.java index 28e2380c2a9d4..537d65dca9255 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScaler.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScaler.java @@ -239,4 +239,12 @@ public Double getLastMeasuredInferenceTime() { public Long getLastMeasuredQueueSize() { return lastMeasuredQueueSize; } + + public Integer getMinNumberOfAllocations() { + return minNumberOfAllocations; + } + + public Integer getMaxNumberOfAllocations() { + return maxNumberOfAllocations; + } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java index 4ca3d2f02a02d..25f485be10add 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java @@ -40,6 +40,7 @@ import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -105,7 +106,7 @@ void init() { "es.ml.trained_models.adaptive_allocations.actual_number_of_allocations.current", "the actual number of allocations", "", - () -> observeLong(AdaptiveAllocationsScaler::getNumberOfAllocations) + this::observeAllocationCount ) ); metrics.add( @@ -179,6 +180,22 @@ Collection observeDouble(Function observeAllocationCount() { + return scalers.values().stream().map(scaler -> { + var value = scaler.getNumberOfAllocations(); + var min = scaler.getMinNumberOfAllocations(); + var max = scaler.getMaxNumberOfAllocations(); + + var attributes = new HashMap(3); + attributes.put("deployment_id", scaler.getDeploymentId()); + attributes.put("min_number_of_allocations", min != null ? min : 0); + if (max != null) { + attributes.put("max_number_of_allocations", max); + } + return new LongWithAttributes(value, Collections.unmodifiableMap(attributes)); + }).toList(); + } } /** diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MlMetricsTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MlMetricsTests.java index 5fb1381b881ea..53a10771bfecc 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MlMetricsTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MlMetricsTests.java @@ -21,6 +21,7 @@ import org.elasticsearch.xpack.core.ml.datafeed.DatafeedState; import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig; import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState; +import org.elasticsearch.xpack.core.ml.inference.assignment.AdaptiveAllocationsSettings; import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingInfo; import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingState; import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignment; @@ -146,11 +147,27 @@ public void testFindTrainedModelAllocationCounts() { TrainedModelAssignment.Builder.empty(mock(StartTrainedModelDeploymentAction.TaskParams.class), null) .addRoutingEntry("node2", new RoutingInfo(0, 1, RoutingState.STARTING, "")) ); + metadataBuilder.addNewAssignment( + "model4", + TrainedModelAssignment.Builder.empty( + mock(StartTrainedModelDeploymentAction.TaskParams.class), + new AdaptiveAllocationsSettings(true, 0, 1) + ).addRoutingEntry("node1", new RoutingInfo(0, 0, RoutingState.STARTING, "")) + ); + metadataBuilder.addNewAssignment( + "model5", + TrainedModelAssignment.Builder.empty( + mock(StartTrainedModelDeploymentAction.TaskParams.class), + new AdaptiveAllocationsSettings(false, 1, 1) + ).addRoutingEntry("node1", new RoutingInfo(1, 1, RoutingState.STARTING, "")) + ); MlMetrics.TrainedModelAllocationCounts counts = MlMetrics.findTrainedModelAllocationCounts(metadataBuilder.build()); - assertThat(counts.trainedModelsTargetAllocations(), is(5)); - assertThat(counts.trainedModelsCurrentAllocations(), is(3)); + assertThat(counts.trainedModelsTargetAllocations(), is(6)); + assertThat(counts.trainedModelsCurrentAllocations(), is(4)); assertThat(counts.trainedModelsFailedAllocations(), is(1)); + assertThat(counts.trainedModelsFixedAllocations(), is(3)); + assertThat(counts.trainedModelsDisabledAdaptiveAllocations(), is(1)); } public void testFindNativeMemoryFree() { From fee6f83a4ff97dc592233560e0b79c0b5d29e113 Mon Sep 17 00:00:00 2001 From: Pat Whelan Date: Mon, 9 Jun 2025 15:02:57 -0400 Subject: [PATCH 2/4] Update docs/changelog/129161.yaml --- docs/changelog/129161.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/129161.yaml diff --git a/docs/changelog/129161.yaml b/docs/changelog/129161.yaml new file mode 100644 index 0000000000000..a871fff01c9d7 --- /dev/null +++ b/docs/changelog/129161.yaml @@ -0,0 +1,5 @@ +pr: 129161 +summary: Add Telemetry for models without adaptive allocations +area: Machine Learning +type: enhancement +issues: [] From d288c58a973ed282c34428d33c48406286182d68 Mon Sep 17 00:00:00 2001 From: Pat Whelan Date: Tue, 10 Jun 2025 12:48:26 -0400 Subject: [PATCH 3/4] Change to scales_to_zero to avoid high cardinality --- .../AdaptiveAllocationsScalerService.java | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java index 25f485be10add..a7812a5dfa0b3 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java @@ -40,7 +40,6 @@ import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -185,15 +184,12 @@ Collection observeAllocationCount() { return scalers.values().stream().map(scaler -> { var value = scaler.getNumberOfAllocations(); var min = scaler.getMinNumberOfAllocations(); - var max = scaler.getMaxNumberOfAllocations(); + var scalesToZero = min == null || min == 0; - var attributes = new HashMap(3); - attributes.put("deployment_id", scaler.getDeploymentId()); - attributes.put("min_number_of_allocations", min != null ? min : 0); - if (max != null) { - attributes.put("max_number_of_allocations", max); - } - return new LongWithAttributes(value, Collections.unmodifiableMap(attributes)); + return new LongWithAttributes( + value, + Map.ofEntries(Map.entry("deployment_id", scaler.getDeploymentId()), Map.entry("scales_to_zero", scalesToZero)) + ); }).toList(); } } From 7dee3fe8609e1afcc2f05266c9310dfca790f6cd Mon Sep 17 00:00:00 2001 From: Pat Whelan Date: Fri, 13 Jun 2025 08:05:44 -0400 Subject: [PATCH 4/4] Count number of deployments --- .../org/elasticsearch/xpack/ml/MlMetrics.java | 22 +++++++++---------- .../xpack/ml/MlMetricsTests.java | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlMetrics.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlMetrics.java index 98ff3ec9944d3..d55ad014c5ede 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlMetrics.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlMetrics.java @@ -277,7 +277,7 @@ private void registerMasterNodeMetrics(MeterRegistry meterRegistry) { "es.ml.trained_models.deployment.fixed_allocations.current", "Sum of current trained model allocations that do not use adaptive allocations (either enabled or disabled)", "allocations", - () -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsFixedAllocations, isMasterMap) + () -> new LongWithAttributes(trainedModelAllocationCounts.deploymentsWithFixedAllocations, isMasterMap) ) ); /* @@ -288,7 +288,7 @@ private void registerMasterNodeMetrics(MeterRegistry meterRegistry) { "es.ml.trained_models.deployment.disabled_adaptive_allocations.current", "Sum of current trained model allocations that have adaptive allocations disabled", "allocations", - () -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsDisabledAdaptiveAllocations, isMasterMap) + () -> new LongWithAttributes(trainedModelAllocationCounts.deploymentsWithDisabledAdaptiveAllocations, isMasterMap) ) ); } @@ -503,19 +503,19 @@ static TrainedModelAllocationCounts findTrainedModelAllocationCounts(TrainedMode int trainedModelsTargetAllocations = 0; int trainedModelsCurrentAllocations = 0; int trainedModelsFailedAllocations = 0; - int trainedModelsFixedAllocations = 0; - int trainedModelsDisabledAdaptiveAllocations = 0; + int deploymentsWithFixedAllocations = 0; + int deploymentsWithDisabledAdaptiveAllocations = 0; for (TrainedModelAssignment trainedModelAssignment : metadata.allAssignments().values()) { trainedModelsTargetAllocations += trainedModelAssignment.totalTargetAllocations(); trainedModelsFailedAllocations += trainedModelAssignment.totalFailedAllocations(); - trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations(); + if (trainedModelAssignment.getAdaptiveAllocationsSettings() == null) { - trainedModelsFixedAllocations += trainedModelAssignment.totalCurrentAllocations(); + deploymentsWithFixedAllocations += 1; } else if ((trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == null) || (trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == false)) { - trainedModelsDisabledAdaptiveAllocations += trainedModelAssignment.totalCurrentAllocations(); + deploymentsWithDisabledAdaptiveAllocations += 1; } } @@ -523,8 +523,8 @@ static TrainedModelAllocationCounts findTrainedModelAllocationCounts(TrainedMode trainedModelsTargetAllocations, trainedModelsCurrentAllocations, trainedModelsFailedAllocations, - trainedModelsFixedAllocations, - trainedModelsDisabledAdaptiveAllocations + deploymentsWithFixedAllocations, + deploymentsWithDisabledAdaptiveAllocations ); } @@ -587,8 +587,8 @@ record TrainedModelAllocationCounts( int trainedModelsTargetAllocations, int trainedModelsCurrentAllocations, int trainedModelsFailedAllocations, - int trainedModelsFixedAllocations, - int trainedModelsDisabledAdaptiveAllocations + int deploymentsWithFixedAllocations, + int deploymentsWithDisabledAdaptiveAllocations ) { static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0, 0, 0); } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MlMetricsTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MlMetricsTests.java index 53a10771bfecc..60d9074959f5c 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MlMetricsTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/MlMetricsTests.java @@ -166,8 +166,8 @@ public void testFindTrainedModelAllocationCounts() { assertThat(counts.trainedModelsTargetAllocations(), is(6)); assertThat(counts.trainedModelsCurrentAllocations(), is(4)); assertThat(counts.trainedModelsFailedAllocations(), is(1)); - assertThat(counts.trainedModelsFixedAllocations(), is(3)); - assertThat(counts.trainedModelsDisabledAdaptiveAllocations(), is(1)); + assertThat(counts.deploymentsWithFixedAllocations(), is(3)); + assertThat(counts.deploymentsWithDisabledAdaptiveAllocations(), is(1)); } public void testFindNativeMemoryFree() {