-
Notifications
You must be signed in to change notification settings - Fork 25.6k
[ML] Fix test failure updating model deployment with stale cluster state. #128667
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -11,8 +11,6 @@ | |||
| import org.apache.logging.log4j.Logger; | ||||
| import org.elasticsearch.ElasticsearchStatusException; | ||||
| import org.elasticsearch.ResourceNotFoundException; | ||||
| import org.elasticsearch.TransportVersion; | ||||
| import org.elasticsearch.TransportVersions; | ||||
| import org.elasticsearch.action.ActionListener; | ||||
| import org.elasticsearch.action.ActionRequestValidationException; | ||||
| import org.elasticsearch.action.support.master.AcknowledgedResponse; | ||||
|
|
@@ -79,9 +77,6 @@ public class TrainedModelAssignmentClusterService implements ClusterStateListene | |||
|
|
||||
| private static final Logger logger = LogManager.getLogger(TrainedModelAssignmentClusterService.class); | ||||
|
|
||||
| private static final TransportVersion RENAME_ALLOCATION_TO_ASSIGNMENT_TRANSPORT_VERSION = TransportVersions.V_8_3_0; | ||||
| public static final TransportVersion DISTRIBUTED_MODEL_ALLOCATION_TRANSPORT_VERSION = TransportVersions.V_8_4_0; | ||||
|
|
||||
| private final ClusterService clusterService; | ||||
| private final ThreadPool threadPool; | ||||
| private final NodeLoadDetector nodeLoadDetector; | ||||
|
|
@@ -169,14 +164,6 @@ public void clusterChanged(ClusterChangedEvent event) { | |||
| return; | ||||
| } | ||||
|
|
||||
| if (eventStateMinTransportVersionIsBeforeDistributedModelAllocationTransportVersion(event)) { | ||||
| // we should not try to rebalance assignments while there may be nodes running on a version | ||||
| // prior to introducing distributed model allocation. | ||||
| // But we should remove routing to removed or shutting down nodes. | ||||
| removeRoutingToRemovedOrShuttingDownNodes(event); | ||||
| return; | ||||
| } | ||||
|
|
||||
| if (event.nodesAdded()) { | ||||
| logMlNodeHeterogeneity(); | ||||
| } | ||||
|
|
@@ -203,10 +190,6 @@ public void clusterChanged(ClusterChangedEvent event) { | |||
| } | ||||
| } | ||||
|
|
||||
| boolean eventStateMinTransportVersionIsBeforeDistributedModelAllocationTransportVersion(ClusterChangedEvent event) { | ||||
| return event.state().getMinTransportVersion().before(DISTRIBUTED_MODEL_ALLOCATION_TRANSPORT_VERSION); | ||||
| } | ||||
|
|
||||
| boolean eventStateHasGlobalBlockStateNotRecoveredBlock(ClusterChangedEvent event) { | ||||
| return event.state().blocks().hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK); | ||||
| } | ||||
|
|
@@ -400,18 +383,6 @@ public void createNewModelAssignment( | |||
| CreateTrainedModelAssignmentAction.Request request, | ||||
| ActionListener<TrainedModelAssignment> listener | ||||
| ) { | ||||
| if (clusterService.state().getMinTransportVersion().before(DISTRIBUTED_MODEL_ALLOCATION_TRANSPORT_VERSION)) { | ||||
| listener.onFailure( | ||||
| new ElasticsearchStatusException( | ||||
| "cannot create new assignment [{}] for model [{}] while cluster upgrade is in progress", | ||||
| RestStatus.CONFLICT, | ||||
| request.getTaskParams().getDeploymentId(), | ||||
| request.getTaskParams().getModelId() | ||||
| ) | ||||
| ); | ||||
| return; | ||||
| } | ||||
|
|
||||
| if (MlMetadata.getMlMetadata(clusterService.state()).isResetMode()) { | ||||
| listener.onFailure( | ||||
| new ElasticsearchStatusException( | ||||
|
|
@@ -522,13 +493,11 @@ private static ClusterState update(ClusterState currentState, TrainedModelAssign | |||
|
|
||||
| private static ClusterState forceUpdate(ClusterState currentState, TrainedModelAssignmentMetadata.Builder modelAssignments) { | ||||
| logger.debug(() -> format("updated assignments: %s", modelAssignments.build())); | ||||
|
|
||||
| ProjectMetadata.Builder builder = ProjectMetadata.builder(currentState.metadata().getProject()); | ||||
| if (currentState.getMinTransportVersion().onOrAfter(RENAME_ALLOCATION_TO_ASSIGNMENT_TRANSPORT_VERSION)) { | ||||
| builder.putCustom(TrainedModelAssignmentMetadata.NAME, modelAssignments.build()) | ||||
| .removeCustom(TrainedModelAssignmentMetadata.DEPRECATED_NAME); | ||||
| } else { | ||||
| builder.putCustom(TrainedModelAssignmentMetadata.DEPRECATED_NAME, modelAssignments.buildOld()); | ||||
| } | ||||
| builder.putCustom(TrainedModelAssignmentMetadata.NAME, modelAssignments.build()) | ||||
| .removeCustom(TrainedModelAssignmentMetadata.DEPRECATED_NAME); | ||||
|
|
||||
| return ClusterState.builder(currentState).putProjectMetadata(builder).build(); | ||||
| } | ||||
|
|
||||
|
|
@@ -844,7 +813,7 @@ private void updateDeployment( | |||
| } | ||||
| boolean hasUpdates = hasUpdates(numberOfAllocations, adaptiveAllocationsSettingsUpdates, existingAssignment); | ||||
| if (hasUpdates == false) { | ||||
| logger.info("no updates"); | ||||
| logger.debug("no updates to be made for deployment [{}]", deploymentId); | ||||
| listener.onResponse(existingAssignment); | ||||
| return; | ||||
| } | ||||
|
|
@@ -858,27 +827,17 @@ private void updateDeployment( | |||
| ); | ||||
| return; | ||||
| } | ||||
| if (clusterState.getMinTransportVersion().before(DISTRIBUTED_MODEL_ALLOCATION_TRANSPORT_VERSION)) { | ||||
| listener.onFailure( | ||||
| new ElasticsearchStatusException( | ||||
| "cannot update deployment with model id [{}] while cluster upgrade is in progress.", | ||||
| RestStatus.CONFLICT, | ||||
| deploymentId | ||||
| ) | ||||
| ); | ||||
| return; | ||||
| } | ||||
|
|
||||
| ActionListener<ClusterState> updatedStateListener = ActionListener.wrap( | ||||
| updatedState -> submitUnbatchedTask("update model deployment", new ClusterStateUpdateTask() { | ||||
| ActionListener<TrainedModelAssignmentMetadata.Builder> updatedAssignmentListener = ActionListener.wrap( | ||||
| updatedAssignment -> submitUnbatchedTask("update model deployment", new ClusterStateUpdateTask() { | ||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the fix, here the new assignment state is passed rather than the updated cluster state. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if the That's a similar bug to the existing one, right? But much smaller, because the Should we protect against that? For example, only replace the Maybe that's overkill and too complicated though. WDYT? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we're deciding not to fix this, let's leave a comment about this small issue with this implementation and call it a day There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking at the Line 610 in ffd02c5
By comparing the starting state with the latest state before applying the updated There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, cool! I missed that part of the code. Guess this all works then as is |
||||
|
|
||||
| private volatile boolean isUpdated; | ||||
|
|
||||
| @Override | ||||
| public ClusterState execute(ClusterState currentState) { | ||||
| if (areClusterStatesCompatibleForRebalance(clusterState, currentState)) { | ||||
| isUpdated = true; | ||||
| return updatedState; | ||||
| return update(currentState, updatedAssignment); | ||||
| } | ||||
| logger.debug(() -> format("[%s] Retrying update as cluster state has been modified", deploymentId)); | ||||
| updateDeployment(currentState, deploymentId, numberOfAllocations, adaptiveAllocationsSettings, isInternal, listener); | ||||
|
|
@@ -910,7 +869,7 @@ public void clusterStateProcessed(ClusterState oldState, ClusterState newState) | |||
| listener::onFailure | ||||
| ); | ||||
|
|
||||
| updateAssignment(clusterState, existingAssignment, numberOfAllocations, adaptiveAllocationsSettings, updatedStateListener); | ||||
| updateAssignment(clusterState, existingAssignment, numberOfAllocations, adaptiveAllocationsSettings, updatedAssignmentListener); | ||||
| } | ||||
|
|
||||
| static boolean hasUpdates( | ||||
|
|
@@ -944,7 +903,7 @@ private void updateAssignment( | |||
| TrainedModelAssignment assignment, | ||||
| Integer numberOfAllocations, | ||||
| AdaptiveAllocationsSettings adaptiveAllocationsSettings, | ||||
| ActionListener<ClusterState> listener | ||||
| ActionListener<TrainedModelAssignmentMetadata.Builder> listener | ||||
| ) { | ||||
| threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME).execute(() -> { | ||||
| if (numberOfAllocations == null || numberOfAllocations == assignment.getTaskParams().getNumberOfAllocations()) { | ||||
|
|
@@ -961,21 +920,21 @@ private void updateAndKeepNumberOfAllocations( | |||
| ClusterState clusterState, | ||||
| TrainedModelAssignment assignment, | ||||
| AdaptiveAllocationsSettings adaptiveAllocationsSettings, | ||||
| ActionListener<ClusterState> listener | ||||
| ActionListener<TrainedModelAssignmentMetadata.Builder> listener | ||||
| ) { | ||||
| TrainedModelAssignment.Builder updatedAssignment = TrainedModelAssignment.Builder.fromAssignment(assignment) | ||||
| .setAdaptiveAllocationsSettings(adaptiveAllocationsSettings); | ||||
| TrainedModelAssignmentMetadata.Builder builder = TrainedModelAssignmentMetadata.builder(clusterState); | ||||
| builder.updateAssignment(assignment.getDeploymentId(), updatedAssignment); | ||||
| listener.onResponse(update(clusterState, builder)); | ||||
| listener.onResponse(builder); | ||||
| } | ||||
|
|
||||
| private void increaseNumberOfAllocations( | ||||
| ClusterState clusterState, | ||||
| TrainedModelAssignment assignment, | ||||
| int numberOfAllocations, | ||||
| AdaptiveAllocationsSettings adaptiveAllocationsSettings, | ||||
| ActionListener<ClusterState> listener | ||||
| ActionListener<TrainedModelAssignmentMetadata.Builder> listener | ||||
| ) { | ||||
| try { | ||||
| TrainedModelAssignment.Builder updatedAssignment = TrainedModelAssignment.Builder.fromAssignment(assignment) | ||||
|
|
@@ -995,7 +954,7 @@ private void increaseNumberOfAllocations( | |||
| ) | ||||
| ); | ||||
| } else { | ||||
| listener.onResponse(update(clusterState, rebalancedMetadata)); | ||||
| listener.onResponse(rebalancedMetadata); | ||||
| } | ||||
| } catch (Exception e) { | ||||
| listener.onFailure(e); | ||||
|
|
@@ -1007,7 +966,7 @@ private void decreaseNumberOfAllocations( | |||
| TrainedModelAssignment assignment, | ||||
| int numberOfAllocations, | ||||
| AdaptiveAllocationsSettings adaptiveAllocationsSettings, | ||||
| ActionListener<ClusterState> listener | ||||
| ActionListener<TrainedModelAssignmentMetadata.Builder> listener | ||||
| ) { | ||||
| TrainedModelAssignment.Builder updatedAssignment = numberOfAllocations < assignment.totalTargetAllocations() | ||||
| ? new AllocationReducer(assignment, nodeAvailabilityZoneMapper.buildMlNodesByAvailabilityZone(clusterState)).reduceTo( | ||||
|
|
@@ -1022,7 +981,7 @@ private void decreaseNumberOfAllocations( | |||
| } | ||||
| TrainedModelAssignmentMetadata.Builder builder = TrainedModelAssignmentMetadata.builder(clusterState); | ||||
| builder.updateAssignment(assignment.getDeploymentId(), updatedAssignment); | ||||
| listener.onResponse(update(clusterState, builder)); | ||||
| listener.onResponse(builder); | ||||
| } | ||||
|
|
||||
| static ClusterState setToStopping(ClusterState clusterState, String deploymentId, String reason) { | ||||
|
|
||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -375,20 +375,17 @@ public void clusterChanged(ClusterChangedEvent event) { | |
| final boolean isResetMode = MlMetadata.getMlMetadata(event.state()).isResetMode(); | ||
| TrainedModelAssignmentMetadata modelAssignmentMetadata = TrainedModelAssignmentMetadata.fromState(event.state()); | ||
| final String currentNode = event.state().nodes().getLocalNodeId(); | ||
| final boolean isNewAllocationSupported = event.state() | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another version change that is irrelevant for 9 |
||
| .getMinTransportVersion() | ||
| .onOrAfter(TrainedModelAssignmentClusterService.DISTRIBUTED_MODEL_ALLOCATION_TRANSPORT_VERSION); | ||
| final Set<String> shuttingDownNodes = Collections.unmodifiableSet(event.state().metadata().nodeShutdowns().getAllNodeIds()); | ||
|
|
||
| if (isResetMode == false && isNewAllocationSupported) { | ||
| if (isResetMode == false) { | ||
| updateNumberOfAllocations(modelAssignmentMetadata); | ||
| } | ||
|
|
||
| for (TrainedModelAssignment trainedModelAssignment : modelAssignmentMetadata.allAssignments().values()) { | ||
| RoutingInfo routingInfo = trainedModelAssignment.getNodeRoutingTable().get(currentNode); | ||
| if (routingInfo != null) { | ||
| // Add new models to start loading if the assignment is not stopping | ||
| if (isNewAllocationSupported && trainedModelAssignment.getAssignmentState() != AssignmentState.STOPPING) { | ||
| if (trainedModelAssignment.getAssignmentState() != AssignmentState.STOPPING) { | ||
| if (shouldAssignmentBeRestarted(routingInfo, trainedModelAssignment.getDeploymentId())) { | ||
| prepareAssignmentForRestart(trainedModelAssignment); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These version checks are redundant in 9.0 and 9.1. The 8.x backports will need to keep them however.