Skip to content

Commit 8639ddf

Browse files
authored
[ML] Release cluster state (#136769) (#136835)
Refactoring TransportMlMemoryAction to not retain cluster state through the lifecycle of the request so that it can be garbage collected as soon as possible. Fix #123243
1 parent ad75481 commit 8639ddf

File tree

3 files changed

+39
-6
lines changed

3 files changed

+39
-6
lines changed

docs/changelog/136769.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 136769
2+
summary: Release cluster state
3+
area: Machine Learning
4+
type: bug
5+
issues:
6+
- 123243

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportMlMemoryAction.java

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import org.elasticsearch.client.internal.Client;
1717
import org.elasticsearch.client.internal.OriginSettingClient;
1818
import org.elasticsearch.client.internal.ParentTaskAssigningClient;
19+
import org.elasticsearch.cluster.ClusterName;
1920
import org.elasticsearch.cluster.ClusterState;
2021
import org.elasticsearch.cluster.block.ClusterBlockException;
2122
import org.elasticsearch.cluster.block.ClusterBlockLevel;
@@ -35,6 +36,7 @@
3536
import org.elasticsearch.xpack.core.ml.action.MlMemoryAction.Response.MlMemoryStats;
3637
import org.elasticsearch.xpack.core.ml.action.TrainedModelCacheInfoAction;
3738
import org.elasticsearch.xpack.core.ml.action.TrainedModelCacheInfoAction.Response.CacheInfo;
39+
import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignmentMetadata;
3840
import org.elasticsearch.xpack.ml.job.NodeLoad;
3941
import org.elasticsearch.xpack.ml.job.NodeLoadDetector;
4042
import org.elasticsearch.xpack.ml.process.MlMemoryTracker;
@@ -88,6 +90,9 @@ protected void masterOperation(
8890

8991
ClusterSettings clusterSettings = clusterService.getClusterSettings();
9092

93+
var clusterName = state.getClusterName();
94+
var trainedModelAssignmentMetadata = TrainedModelAssignmentMetadata.fromState(state);
95+
PersistentTasksCustomMetadata persistentTasksCustomMetadata = state.getMetadata().custom(PersistentTasksCustomMetadata.TYPE);
9196
// Resolve the node specification to some concrete nodes
9297
String[] nodeIds = state.nodes().resolveNodes(request.getNodeId());
9398

@@ -116,7 +121,9 @@ protected void masterOperation(
116121
trainedModelCacheInfoRequest,
117122
delegate2.delegateFailureAndWrap(
118123
(l, trainedModelCacheInfoResponse) -> handleResponses(
119-
state,
124+
clusterName,
125+
persistentTasksCustomMetadata,
126+
trainedModelAssignmentMetadata,
120127
clusterSettings,
121128
nodesStatsResponse,
122129
trainedModelCacheInfoResponse,
@@ -131,12 +138,14 @@ protected void masterOperation(
131138
if (memoryTracker.isEverRefreshed()) {
132139
memoryTrackerRefreshListener.onResponse(null);
133140
} else {
134-
memoryTracker.refresh(state.getMetadata().custom(PersistentTasksCustomMetadata.TYPE), memoryTrackerRefreshListener);
141+
memoryTracker.refresh(persistentTasksCustomMetadata, memoryTrackerRefreshListener);
135142
}
136143
}
137144

138145
void handleResponses(
139-
ClusterState state,
146+
ClusterName clusterName,
147+
PersistentTasksCustomMetadata persistentTasks,
148+
TrainedModelAssignmentMetadata assignmentMetadata,
140149
ClusterSettings clusterSettings,
141150
NodesStatsResponse nodesStatsResponse,
142151
TrainedModelCacheInfoAction.Response trainedModelCacheInfoResponse,
@@ -174,7 +183,8 @@ void handleResponses(
174183
ByteSizeValue mlNativeInference;
175184
if (node.getRoles().contains(DiscoveryNodeRole.ML_ROLE)) {
176185
NodeLoad nodeLoad = nodeLoadDetector.detectNodeLoad(
177-
state,
186+
persistentTasks,
187+
assignmentMetadata,
178188
node,
179189
maxOpenJobsPerNode,
180190
maxMachineMemoryPercent,
@@ -220,7 +230,7 @@ void handleResponses(
220230
);
221231
}
222232

223-
listener.onResponse(new MlMemoryAction.Response(state.getClusterName(), nodeResponses, failures));
233+
listener.onResponse(new MlMemoryAction.Response(clusterName, nodeResponses, failures));
224234
}
225235

226236
@Override

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/NodeLoadDetector.java

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,24 @@ public NodeLoad detectNodeLoad(
8787
int maxMachineMemoryPercent,
8888
boolean useAutoMachineMemoryCalculation
8989
) {
90-
PersistentTasksCustomMetadata persistentTasks = clusterState.getMetadata().custom(PersistentTasksCustomMetadata.TYPE);
90+
return detectNodeLoad(
91+
clusterState.getMetadata().custom(PersistentTasksCustomMetadata.TYPE),
92+
assignmentMetadata,
93+
node,
94+
maxNumberOfOpenJobs,
95+
maxMachineMemoryPercent,
96+
useAutoMachineMemoryCalculation
97+
);
98+
}
99+
100+
public NodeLoad detectNodeLoad(
101+
PersistentTasksCustomMetadata persistentTasks,
102+
TrainedModelAssignmentMetadata assignmentMetadata,
103+
DiscoveryNode node,
104+
int maxNumberOfOpenJobs,
105+
int maxMachineMemoryPercent,
106+
boolean useAutoMachineMemoryCalculation
107+
) {
91108
Map<String, String> nodeAttributes = node.getAttributes();
92109
List<String> errors = new ArrayList<>();
93110
OptionalLong maxMlMemory = NativeMemoryCalculator.allowedBytesForMl(node, maxMachineMemoryPercent, useAutoMachineMemoryCalculation);

0 commit comments

Comments
 (0)