+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.ml.inference;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.elasticsearch.ElasticsearchStatusException;
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.rest.RestStatus;
+import org.elasticsearch.tasks.TaskId;
+import org.elasticsearch.xpack.core.ml.action.InferModelAction;
+import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingInfo;
+import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingState;
+import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignment;
+import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignmentMetadata;
+import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
+import org.elasticsearch.xpack.ml.inference.assignment.TrainedModelAssignmentService;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.BiConsumer;
+import java.util.function.Predicate;
+
+import static org.elasticsearch.core.Strings.format;
+
+/**
+ * Class for storing inference requests for ML trained models while
+ * scaling is in progress. Once the trained model has at least one
+ * allocation the stored requests are forwarded to a consumer for
+ * processing. Requests time out while waiting for scale.
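+ * <p>
+ * A minimal usage sketch; the {@code handleScaledInference} handler and the
+ * surrounding variables are illustrative, not part of this class:
+ * <pre>{@code
+ * InferenceWaitForAllocation waiter = new InferenceWaitForAllocation(
+ *     assignmentService,
+ *     (waitingRequest, assignment) -> handleScaledInference(waitingRequest, assignment)
+ * );
+ * waiter.waitForAssignment(
+ *     new InferenceWaitForAllocation.WaitingRequest(inferRequest, responseBuilder, parentTaskId, listener)
+ * );
+ * }</pre>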
+ */
+public class InferenceWaitForAllocation {
+
+    public static final int MAX_PENDING_REQUEST_COUNT = 100;
+
+    /**
+     * Track details of the pending request.
+     */
+    public record WaitingRequest(
+        InferModelAction.Request request,
+        InferModelAction.Response.Builder responseBuilder,
+        TaskId parentTaskId,
+        ActionListener<InferModelAction.Response> listener
+    ) {
+        public String deploymentId() {
+            return request.getId();
+        }
+    }
+
+    private static final Logger logger = LogManager.getLogger(InferenceWaitForAllocation.class);
+
+    private final TrainedModelAssignmentService assignmentService;
+    private final BiConsumer<WaitingRequest, TrainedModelAssignment> queuedConsumer;
+    private final AtomicInteger pendingRequestCount = new AtomicInteger();
+
+    /**
+     * Create with a consumer for requests whose deployments have scaled.
+     * @param assignmentService Trained model assignment service
+     * @param onInferenceScaledConsumer The consumer of the waiting request, called once an
+     *                                  allocation is available.
+     */
+    public InferenceWaitForAllocation(
+        TrainedModelAssignmentService assignmentService,
+        BiConsumer<WaitingRequest, TrainedModelAssignment> onInferenceScaledConsumer
+    ) {
+        this.assignmentService = assignmentService;
+        this.queuedConsumer = onInferenceScaledConsumer;
+    }
+
+    /**
+     * Wait for at least one allocation to be started, then process the
+     * inference request.
+     * If the pending request count is greater than {@link #MAX_PENDING_REQUEST_COUNT}
+     * the request listener is failed with a too-many-requests exception.
+     * The timeout is the inference request timeout.
+     * @param request The inference request details
+     */
+    public synchronized void waitForAssignment(WaitingRequest request) {
+        logger.debug("[{}] waiting for allocation", request.deploymentId());
+        if (pendingRequestCount.get() > MAX_PENDING_REQUEST_COUNT) {
+            request.listener()
+                .onFailure(
+                    new ElasticsearchStatusException(
+                        "Rejected inference request waiting for an allocation of deployment [{}]. Too many pending requests",
+                        RestStatus.TOO_MANY_REQUESTS,
+                        request.deploymentId()
+                    )
+                );
+            return;
+        }
+
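+        // Count this request against the cap, then wait for the cluster state
+        // to show at least one allocation for the deployment.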
+        pendingRequestCount.incrementAndGet();
+        var predicate = new DeploymentHasAtLeastOneAllocation(request.deploymentId());
+
+        assignmentService.waitForAssignmentCondition(
+            request.deploymentId(),
+            predicate,
+            request.request().getInferenceTimeout(),
+            new WaitingListener(request.deploymentId(), request, predicate)
+        );
+    }
+
+    private static class DeploymentHasAtLeastOneAllocation implements Predicate<ClusterState> {
+
+        private final String deploymentId;
+        private final AtomicReference<Exception> exception = new AtomicReference<>();
+
+        DeploymentHasAtLeastOneAllocation(String deploymentId) {
+            this.deploymentId = ExceptionsHelper.requireNonNull(deploymentId, "deployment_id");
+        }
+
+        @Override
+        public boolean test(ClusterState clusterState) {
+            logger.debug(() -> format("[%s] testing if deployment has at least one allocation", deploymentId));
+            TrainedModelAssignment trainedModelAssignment = TrainedModelAssignmentMetadata.assignmentForDeploymentId(
+                clusterState,
+                deploymentId
+            ).orElse(null);
+            if (trainedModelAssignment == null) {
+                logger.info(() -> format("[%s] assignment was null while waiting to scale up", deploymentId));
+                return false;
+            }
+
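+            // Collect the nodes whose routing state is FAILED, keyed by node id,
+            // along with their failure reasons.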
+            Map<String, String> nodeFailuresAndReasons = new HashMap<>();
+            for (var nodeIdAndRouting : trainedModelAssignment.getNodeRoutingTable().entrySet()) {
+                if (RoutingState.FAILED.equals(nodeIdAndRouting.getValue().getState())) {
+                    nodeFailuresAndReasons.put(nodeIdAndRouting.getKey(), nodeIdAndRouting.getValue().getReason());
+                }
+            }
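+            // If every node has failed there is nothing to wait for: latch the error
+            // and stop testing. If only some nodes have failed, log and keep waiting.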
+            if (nodeFailuresAndReasons.isEmpty() == false) {
+                if (nodeFailuresAndReasons.size() == trainedModelAssignment.getNodeRoutingTable().size()) {
+                    exception.set(
+                        new ElasticsearchStatusException(
+                            "[{}] Error waiting for a model allocation, all nodes have failed with errors [{}]",
+                            RestStatus.INTERNAL_SERVER_ERROR,
+                            trainedModelAssignment.getDeploymentId(),
+                            nodeFailuresAndReasons
+                        )
+                    );
+                    return true; // don't try again
+                } else {
+                    logger.warn(
+                        "Deployment [{}] has failed routes [{}]",
+                        trainedModelAssignment.getDeploymentId(),
+                        nodeFailuresAndReasons
+                    );
+                }
+            }
+
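+            // The wait condition is satisfied once at least one route can serve traffic.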
+            var routable = trainedModelAssignment.getNodeRoutingTable().values().stream().filter(RoutingInfo::isRoutable).findFirst();
+            if (routable.isPresent()) {
+                logger.debug(
+                    () -> format(
+                        "[%s] first route %s, state %s",
+                        deploymentId,
+                        routable.get(),
+                        trainedModelAssignment.calculateAllocationStatus()
+                    )
+                );
+            } else {
+                logger.debug(() -> format("[%s] no routes", deploymentId));
+            }
+
+            return routable.isPresent();
+        }
+    }
+
+    private class WaitingListener implements TrainedModelAssignmentService.WaitForAssignmentListener {
+
+        private final String deploymentId;
+        private final WaitingRequest request;
+        private final DeploymentHasAtLeastOneAllocation predicate;
+
+        private WaitingListener(String deploymentId, WaitingRequest request, DeploymentHasAtLeastOneAllocation predicate) {
+            this.deploymentId = deploymentId;
+            this.request = request;
+            this.predicate = predicate;
+        }
+
+        @Override
+        public void onResponse(TrainedModelAssignment assignment) {
+            // assignment is started, do inference
+            pendingRequestCount.decrementAndGet();
+
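+            // The predicate may have latched an all-nodes-failed error; if so,
+            // fail the request instead of forwarding it for inference.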
+            if (predicate.exception.get() != null) {
+                onFailure(predicate.exception.get());
+                return;
+            }
+
+            logger.debug("[{}] sending request that waited for allocation", deploymentId);
+            queuedConsumer.accept(request, assignment);
+        }
+
+        @Override
+        public void onFailure(Exception e) {
+            logger.debug("[{}] failed waiting for allocation", deploymentId, e);
+            pendingRequestCount.decrementAndGet();
+            request.listener().onFailure(e);
+        }
+    }
+}