Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/129161.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 129161
summary: Add Telemetry for models without adaptive allocations
area: Machine Learning
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,25 @@ private void registerMasterNodeMetrics(MeterRegistry meterRegistry) {
() -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsFailedAllocations, isMasterMap)
)
);
metrics.add(
meterRegistry.registerLongGauge(
"es.ml.trained_models.deployment.fixed_allocations.current",
"Sum of current trained model allocations that do not use adaptive allocations (either enabled or disabled)",
"allocations",
() -> new LongWithAttributes(trainedModelAllocationCounts.deploymentsWithFixedAllocations, isMasterMap)
)
);
/*
 * No gauge is registered here for deployments that have adaptive allocations enabled:
 * AdaptiveAllocationsScalerService already tracks the number of allocations for those deployments.
 */
metrics.add(
meterRegistry.registerLongGauge(
"es.ml.trained_models.deployment.disabled_adaptive_allocations.current",
"Sum of current trained model allocations that have adaptive allocations disabled",
"allocations",
() -> new LongWithAttributes(trainedModelAllocationCounts.deploymentsWithDisabledAdaptiveAllocations, isMasterMap)
)
);
}

@Override
Expand Down Expand Up @@ -484,17 +503,28 @@ static TrainedModelAllocationCounts findTrainedModelAllocationCounts(TrainedMode
int trainedModelsTargetAllocations = 0;
int trainedModelsCurrentAllocations = 0;
int trainedModelsFailedAllocations = 0;
int deploymentsWithFixedAllocations = 0;
int deploymentsWithDisabledAdaptiveAllocations = 0;

for (TrainedModelAssignment trainedModelAssignment : metadata.allAssignments().values()) {
trainedModelsTargetAllocations += trainedModelAssignment.totalTargetAllocations();
trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations();
trainedModelsFailedAllocations += trainedModelAssignment.totalFailedAllocations();
trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations();

if (trainedModelAssignment.getAdaptiveAllocationsSettings() == null) {
deploymentsWithFixedAllocations += 1;
} else if ((trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == null)
|| (trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == false)) {
deploymentsWithDisabledAdaptiveAllocations += 1;
}
}

return new TrainedModelAllocationCounts(
trainedModelsTargetAllocations,
trainedModelsCurrentAllocations,
trainedModelsFailedAllocations
trainedModelsFailedAllocations,
deploymentsWithFixedAllocations,
deploymentsWithDisabledAdaptiveAllocations
);
}

Expand Down Expand Up @@ -556,8 +586,10 @@ record MlTaskStatusCounts(
/**
 * Aggregated counters computed over all trained model assignments.
 *
 * @param trainedModelsTargetAllocations sum of {@code totalTargetAllocations()} over all assignments
 * @param trainedModelsCurrentAllocations sum of {@code totalCurrentAllocations()} over all assignments
 * @param trainedModelsFailedAllocations sum of {@code totalFailedAllocations()} over all assignments
 * @param deploymentsWithFixedAllocations number of assignments that have no adaptive allocations settings at all
 * @param deploymentsWithDisabledAdaptiveAllocations number of assignments that have adaptive allocations settings
 *        whose {@code enabled} flag is either unset or {@code false}
 */
record TrainedModelAllocationCounts(
    int trainedModelsTargetAllocations,
    int trainedModelsCurrentAllocations,
    int trainedModelsFailedAllocations,
    int deploymentsWithFixedAllocations,
    int deploymentsWithDisabledAdaptiveAllocations
) {
    /** Instance with every counter set to zero. */
    static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0, 0, 0);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -239,4 +239,12 @@ public Double getLastMeasuredInferenceTime() {
/**
 * Returns the value currently held in {@code lastMeasuredQueueSize}.
 *
 * @return the stored queue size; the boxed return type permits {@code null}
 *         (NOTE(review): presumably when nothing has been measured yet — confirm)
 */
public Long getLastMeasuredQueueSize() {
    return this.lastMeasuredQueueSize;
}

/**
 * Returns the value currently held in {@code minNumberOfAllocations}.
 *
 * @return the configured minimum number of allocations; may be {@code null}, which callers
 *         such as the allocation-count metric treat the same as a minimum of zero
 */
public Integer getMinNumberOfAllocations() {
    return this.minNumberOfAllocations;
}

/**
 * Returns the value currently held in {@code maxNumberOfAllocations}.
 *
 * @return the configured maximum number of allocations; the boxed return type permits {@code null}
 *         (NOTE(review): presumably meaning no upper bound is set — confirm)
 */
public Integer getMaxNumberOfAllocations() {
    return this.maxNumberOfAllocations;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ void init() {
"es.ml.trained_models.adaptive_allocations.actual_number_of_allocations.current",
"the actual number of allocations",
"",
() -> observeLong(AdaptiveAllocationsScaler::getNumberOfAllocations)
this::observeAllocationCount
)
);
metrics.add(
Expand Down Expand Up @@ -179,6 +179,19 @@ Collection<DoubleWithAttributes> observeDouble(Function<AdaptiveAllocationsScale
}
return observations;
}

/**
 * Produces one observation per registered scaler, reporting its current number of allocations.
 * <p>
 * Each observation carries two attributes:
 * <ul>
 *   <li>{@code deployment_id} - the deployment id of the scaler</li>
 *   <li>{@code scales_to_zero} - {@code true} when the scaler's minimum number of allocations
 *       is either unset or zero</li>
 * </ul>
 *
 * @return an unmodifiable collection with one entry for every value in {@code scalers}
 */
Collection<LongWithAttributes> observeAllocationCount() {
    return scalers.values().stream().map(scaler -> {
        var allocationCount = scaler.getNumberOfAllocations();
        var minAllocations = scaler.getMinNumberOfAllocations();
        // An absent minimum is treated exactly like an explicit minimum of zero.
        boolean canScaleToZero = minAllocations == null || minAllocations == 0;

        Map<String, Object> attributes = Map.of("deployment_id", scaler.getDeploymentId(), "scales_to_zero", canScaleToZero);
        return new LongWithAttributes(allocationCount, attributes);
    }).toList();
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.elasticsearch.xpack.core.ml.datafeed.DatafeedState;
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState;
import org.elasticsearch.xpack.core.ml.inference.assignment.AdaptiveAllocationsSettings;
import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingInfo;
import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingState;
import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignment;
Expand Down Expand Up @@ -146,11 +147,27 @@ public void testFindTrainedModelAllocationCounts() {
TrainedModelAssignment.Builder.empty(mock(StartTrainedModelDeploymentAction.TaskParams.class), null)
.addRoutingEntry("node2", new RoutingInfo(0, 1, RoutingState.STARTING, ""))
);
metadataBuilder.addNewAssignment(
"model4",
TrainedModelAssignment.Builder.empty(
mock(StartTrainedModelDeploymentAction.TaskParams.class),
new AdaptiveAllocationsSettings(true, 0, 1)
).addRoutingEntry("node1", new RoutingInfo(0, 0, RoutingState.STARTING, ""))
);
metadataBuilder.addNewAssignment(
"model5",
TrainedModelAssignment.Builder.empty(
mock(StartTrainedModelDeploymentAction.TaskParams.class),
new AdaptiveAllocationsSettings(false, 1, 1)
).addRoutingEntry("node1", new RoutingInfo(1, 1, RoutingState.STARTING, ""))
);

MlMetrics.TrainedModelAllocationCounts counts = MlMetrics.findTrainedModelAllocationCounts(metadataBuilder.build());
assertThat(counts.trainedModelsTargetAllocations(), is(5));
assertThat(counts.trainedModelsCurrentAllocations(), is(3));
assertThat(counts.trainedModelsTargetAllocations(), is(6));
assertThat(counts.trainedModelsCurrentAllocations(), is(4));
assertThat(counts.trainedModelsFailedAllocations(), is(1));
assertThat(counts.deploymentsWithFixedAllocations(), is(3));
assertThat(counts.deploymentsWithDisabledAdaptiveAllocations(), is(1));
}

public void testFindNativeMemoryFree() {
Expand Down
Loading