Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/129161.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 129161
summary: Add Telemetry for models without adaptive allocations
area: Machine Learning
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,25 @@ private void registerMasterNodeMetrics(MeterRegistry meterRegistry) {
() -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsFailedAllocations, isMasterMap)
)
);
metrics.add(
meterRegistry.registerLongGauge(
"es.ml.trained_models.deployment.fixed_allocations.current",
"Sum of current trained model allocations that do not use adaptive allocations (either enabled or disabled)",
"allocations",
() -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsFixedAllocations, isMasterMap)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can the project type be added to the attribute map? If there are different rules for different project types it would be useful to split the data that way

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so? It looks like it comes from serverless.project_type which isn't available here. We could move this metric to serverless, or we can use ES|QL magic to pull in the project type from other metrics via the project id.

It's possible this will get automatically added when running in serverless.

)
);
/*
* AdaptiveAllocationsScalerService tracks the number of allocations with adaptive allocations enabled.
*/
metrics.add(
meterRegistry.registerLongGauge(
"es.ml.trained_models.deployment.disabled_adaptive_allocations.current",
"Sum of current trained model allocations that have adaptive allocations disabled",
"allocations",
() -> new LongWithAttributes(trainedModelAllocationCounts.trainedModelsDisabledAdaptiveAllocations, isMasterMap)
)
);
}

@Override
Expand Down Expand Up @@ -484,17 +503,28 @@ static TrainedModelAllocationCounts findTrainedModelAllocationCounts(TrainedMode
int trainedModelsTargetAllocations = 0;
int trainedModelsCurrentAllocations = 0;
int trainedModelsFailedAllocations = 0;
int trainedModelsFixedAllocations = 0;
int trainedModelsDisabledAdaptiveAllocations = 0;

for (TrainedModelAssignment trainedModelAssignment : metadata.allAssignments().values()) {
trainedModelsTargetAllocations += trainedModelAssignment.totalTargetAllocations();
trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations();
trainedModelsFailedAllocations += trainedModelAssignment.totalFailedAllocations();

trainedModelsCurrentAllocations += trainedModelAssignment.totalCurrentAllocations();
if (trainedModelAssignment.getAdaptiveAllocationsSettings() == null) {
trainedModelsFixedAllocations += trainedModelAssignment.totalCurrentAllocations();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here and in line 518 the code is summing the number of allocations from all deployments that do not use adaptive allocations. A single deployment could have 10 allocations and we wouldn't know if the user has 10 deployments with 1 allocation or 1 deployment with 10.

I think counting the number of deployments would be more meaningful

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that is a good point — we can simply increment by 1 per deployment so the metric counts deployments rather than allocations.

} else if ((trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == null)
|| (trainedModelAssignment.getAdaptiveAllocationsSettings().getEnabled() == false)) {
trainedModelsDisabledAdaptiveAllocations += trainedModelAssignment.totalCurrentAllocations();
}
}

return new TrainedModelAllocationCounts(
trainedModelsTargetAllocations,
trainedModelsCurrentAllocations,
trainedModelsFailedAllocations
trainedModelsFailedAllocations,
trainedModelsFixedAllocations,
trainedModelsDisabledAdaptiveAllocations
);
}

Expand Down Expand Up @@ -556,8 +586,10 @@ record MlTaskStatusCounts(
record TrainedModelAllocationCounts(
int trainedModelsTargetAllocations,
int trainedModelsCurrentAllocations,
int trainedModelsFailedAllocations
int trainedModelsFailedAllocations,
int trainedModelsFixedAllocations,
int trainedModelsDisabledAdaptiveAllocations
) {
static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0);
static final TrainedModelAllocationCounts EMPTY = new TrainedModelAllocationCounts(0, 0, 0, 0, 0);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -239,4 +239,12 @@ public Double getLastMeasuredInferenceTime() {
public Long getLastMeasuredQueueSize() {
return lastMeasuredQueueSize;
}

/**
 * Returns the configured lower bound on the number of allocations for this deployment,
 * or {@code null} when no minimum has been set (presumably meaning the deployment may
 * scale down to zero — see the {@code scales_to_zero} attribute computed from this value).
 */
public Integer getMinNumberOfAllocations() {
return minNumberOfAllocations;
}

/**
 * Returns the configured upper bound on the number of allocations for this deployment,
 * or {@code null} when no maximum has been set.
 */
public Integer getMaxNumberOfAllocations() {
return maxNumberOfAllocations;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ void init() {
"es.ml.trained_models.adaptive_allocations.actual_number_of_allocations.current",
"the actual number of allocations",
"",
() -> observeLong(AdaptiveAllocationsScaler::getNumberOfAllocations)
this::observeAllocationCount
)
);
metrics.add(
Expand Down Expand Up @@ -179,6 +179,19 @@ Collection<DoubleWithAttributes> observeDouble(Function<AdaptiveAllocationsScale
}
return observations;
}

/**
 * Builds one gauge observation per tracked scaler for the
 * {@code es.ml.trained_models.adaptive_allocations.actual_number_of_allocations.current}
 * metric: the scaler's current allocation count, tagged with its deployment id and a
 * {@code scales_to_zero} flag (true when no minimum is configured or the minimum is zero).
 */
Collection<LongWithAttributes> observeAllocationCount() {
return scalers.values().stream().map(scaler -> {
// NOTE(review): assumes getNumberOfAllocations() is non-null for every tracked
// scaler — confirm; a null here would NPE when boxed into LongWithAttributes.
var value = scaler.getNumberOfAllocations();
var min = scaler.getMinNumberOfAllocations();
// Null-check first: short-circuit keeps the unboxing `min == 0` comparison safe.
var scalesToZero = min == null || min == 0;

return new LongWithAttributes(
value,
Map.ofEntries(Map.entry("deployment_id", scaler.getDeploymentId()), Map.entry("scales_to_zero", scalesToZero))
);
}).toList();
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.elasticsearch.xpack.core.ml.datafeed.DatafeedState;
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsConfig;
import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState;
import org.elasticsearch.xpack.core.ml.inference.assignment.AdaptiveAllocationsSettings;
import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingInfo;
import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingState;
import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignment;
Expand Down Expand Up @@ -146,11 +147,27 @@ public void testFindTrainedModelAllocationCounts() {
TrainedModelAssignment.Builder.empty(mock(StartTrainedModelDeploymentAction.TaskParams.class), null)
.addRoutingEntry("node2", new RoutingInfo(0, 1, RoutingState.STARTING, ""))
);
metadataBuilder.addNewAssignment(
"model4",
TrainedModelAssignment.Builder.empty(
mock(StartTrainedModelDeploymentAction.TaskParams.class),
new AdaptiveAllocationsSettings(true, 0, 1)
).addRoutingEntry("node1", new RoutingInfo(0, 0, RoutingState.STARTING, ""))
);
metadataBuilder.addNewAssignment(
"model5",
TrainedModelAssignment.Builder.empty(
mock(StartTrainedModelDeploymentAction.TaskParams.class),
new AdaptiveAllocationsSettings(false, 1, 1)
).addRoutingEntry("node1", new RoutingInfo(1, 1, RoutingState.STARTING, ""))
);

MlMetrics.TrainedModelAllocationCounts counts = MlMetrics.findTrainedModelAllocationCounts(metadataBuilder.build());
assertThat(counts.trainedModelsTargetAllocations(), is(5));
assertThat(counts.trainedModelsCurrentAllocations(), is(3));
assertThat(counts.trainedModelsTargetAllocations(), is(6));
assertThat(counts.trainedModelsCurrentAllocations(), is(4));
assertThat(counts.trainedModelsFailedAllocations(), is(1));
assertThat(counts.trainedModelsFixedAllocations(), is(3));
assertThat(counts.trainedModelsDisabledAdaptiveAllocations(), is(1));
}

public void testFindNativeMemoryFree() {
Expand Down
Loading