the rest

davidkyle · davidkyle · commit 1d86ce831a75 · 2025-09-12T13:09:21.000+01:00
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/TransportUnifiedCompletionInferenceAction.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/TransportUnifiedCompletionInferenceAction.java
@@ -27,7 +27,6 @@
 import org.elasticsearch.xpack.core.inference.action.UnifiedCompletionAction;
 import org.elasticsearch.xpack.core.inference.results.UnifiedChatCompletionException;
 import org.elasticsearch.xpack.inference.action.task.StreamingTaskManager;
-import org.elasticsearch.xpack.inference.common.InferenceServiceRateLimitCalculator;
 import org.elasticsearch.xpack.inference.registry.InferenceEndpointRegistry;
 
 import java.util.concurrent.Flow;
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/external/http/sender/RequestExecutorService.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/external/http/sender/RequestExecutorService.java
@@ -97,8 +97,6 @@ interface RateLimiterCreator {
     private static final TimeValue RATE_LIMIT_GROUP_CLEANUP_INTERVAL = TimeValue.timeValueDays(1);
 
     private final ConcurrentMap<Object, RateLimitingEndpointHandler> rateLimitGroupings = new ConcurrentHashMap<>();
-    // TODO: add one atomic integer (number of nodes); also explain the assumption and why this works
-    // TODO: document that this impacts chat completion (and increase the default rate limit)
     private final AtomicInteger rateLimitDivisor = new AtomicInteger(1);
     private final ThreadPool threadPool;
     private final CountDownLatch startupLatch;
@@ -404,10 +402,6 @@ public void init() {
         }
 
         /**
-         * This method is solely called by {@link InferenceServiceNodeLocalRateLimitCalculator} to update
-         * rate limits, so they're "node-local".
-         * The general idea is described in {@link InferenceServiceNodeLocalRateLimitCalculator} in more detail.
-         *
          * @param divisor - divisor to divide the initial requests per time unit by
          */
         private synchronized void updateTokensPerTimeUnit(Integer divisor) {
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/BaseTransportInferenceActionTestCase.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/BaseTransportInferenceActionTestCase.java
@@ -32,7 +32,6 @@
 import org.elasticsearch.xpack.core.inference.action.InferenceAction;
 import org.elasticsearch.xpack.inference.InferencePlugin;
 import org.elasticsearch.xpack.inference.action.task.StreamingTaskManager;
-import org.elasticsearch.xpack.inference.common.InferenceServiceRateLimitCalculator;
 import org.elasticsearch.xpack.inference.registry.InferenceEndpointRegistry;
 import org.junit.Before;
 import org.mockito.ArgumentCaptor;
@@ -68,7 +67,6 @@ public abstract class BaseTransportInferenceActionTestCase<Request extends BaseI
     protected static final String localNodeId = "local-node-id";
     protected InferenceServiceRegistry serviceRegistry;
     protected InferenceStats inferenceStats;
-    protected InferenceServiceRateLimitCalculator inferenceServiceRateLimitCalculator;
     protected TransportService transportService;
     protected NodeClient nodeClient;
 
@@ -83,7 +81,6 @@ public void setUp() throws Exception {
         threadPool = mock();
         nodeClient = mock();
         transportService = mock();
-        inferenceServiceRateLimitCalculator = mock();
         licenseState = mock();
         inferenceEndpointRegistry = mock();
         serviceRegistry = mock();
@@ -98,7 +95,6 @@ public void setUp() throws Exception {
             serviceRegistry,
             inferenceStats,
             streamingTaskManager,
-            inferenceServiceRateLimitCalculator,
             nodeClient,
             threadPool
         );
@@ -115,7 +111,6 @@ protected abstract BaseTransportInferenceAction<Request> createAction(
         InferenceServiceRegistry serviceRegistry,
         InferenceStats inferenceStats,
         StreamingTaskManager streamingTaskManager,
-        InferenceServiceRateLimitCalculator inferenceServiceNodeLocalRateLimitCalculator,
         NodeClient nodeClient,
         ThreadPool threadPool
     );
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/TransportInferenceActionTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/TransportInferenceActionTests.java
@@ -9,34 +9,17 @@
 
 import org.elasticsearch.action.support.ActionFilters;
 import org.elasticsearch.client.internal.node.NodeClient;
-import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.inference.InferenceServiceRegistry;
 import org.elasticsearch.inference.TaskType;
 import org.elasticsearch.inference.telemetry.InferenceStats;
 import org.elasticsearch.license.MockLicenseState;
 import org.elasticsearch.threadpool.ThreadPool;
-import org.elasticsearch.transport.TransportException;
-import org.elasticsearch.transport.TransportResponseHandler;
 import org.elasticsearch.transport.TransportService;
 import org.elasticsearch.xpack.core.inference.action.InferenceAction;
 import org.elasticsearch.xpack.inference.action.task.StreamingTaskManager;
-import org.elasticsearch.xpack.inference.common.InferenceServiceRateLimitCalculator;
-import org.elasticsearch.xpack.inference.common.RateLimitAssignment;
 import org.elasticsearch.xpack.inference.registry.InferenceEndpointRegistry;
 
-import java.util.List;
-
-import static org.hamcrest.Matchers.is;
-import static org.mockito.ArgumentMatchers.any;
-import static org.mockito.ArgumentMatchers.anyLong;
-import static org.mockito.ArgumentMatchers.assertArg;
-import static org.mockito.ArgumentMatchers.eq;
-import static org.mockito.ArgumentMatchers.same;
-import static org.mockito.Mockito.doAnswer;
 import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.never;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.when;
 
 public class TransportInferenceActionTests extends BaseTransportInferenceActionTestCase<InferenceAction.Request> {
 
@@ -53,7 +36,6 @@ protected BaseTransportInferenceAction<InferenceAction.Request> createAction(
         InferenceServiceRegistry serviceRegistry,
         InferenceStats inferenceStats,
         StreamingTaskManager streamingTaskManager,
-        InferenceServiceRateLimitCalculator inferenceServiceNodeLocalRateLimitCalculator,
         NodeClient nodeClient,
         ThreadPool threadPool
     ) {
@@ -65,7 +47,6 @@ protected BaseTransportInferenceAction<InferenceAction.Request> createAction(
             serviceRegistry,
             inferenceStats,
             streamingTaskManager,
-            inferenceServiceNodeLocalRateLimitCalculator,
             nodeClient,
             threadPool
         );
@@ -75,136 +56,4 @@ protected BaseTransportInferenceAction<InferenceAction.Request> createAction(
     protected InferenceAction.Request createRequest() {
         return mock(InferenceAction.Request.class);
     }
-
-    public void testNoRerouting_WhenTaskTypeNotSupported() {
-        TaskType unsupportedTaskType = TaskType.COMPLETION;
-        mockService(listener -> listener.onResponse(mock()));
-
-        when(inferenceServiceRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, unsupportedTaskType)).thenReturn(false);
-
-        var listener = doExecute(unsupportedTaskType);
-
-        verify(listener).onResponse(any());
-        // Verify request was handled locally (not rerouted using TransportService)
-        verify(transportService, never()).sendRequest(any(), any(), any(), any());
-        // Verify request metric attributes were recorded on the node performing inference
-        verify(inferenceStats.inferenceDuration()).record(anyLong(), assertArg(attributes -> {
-            assertThat(attributes.get("rerouted"), is(Boolean.FALSE));
-            assertThat(attributes.get("node_id"), is(localNodeId));
-        }));
-    }
-
-    public void testNoRerouting_WhenNoGroupingCalculatedYet() {
-        mockService(listener -> listener.onResponse(mock()));
-
-        when(inferenceServiceRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
-        when(inferenceServiceRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(null);
-
-        var listener = doExecute(taskType);
-
-        verify(listener).onResponse(any());
-        // Verify request was handled locally (not rerouted using TransportService)
-        verify(transportService, never()).sendRequest(any(), any(), any(), any());
-        // Verify request metric attributes were recorded on the node performing inference
-        verify(inferenceStats.inferenceDuration()).record(anyLong(), assertArg(attributes -> {
-            assertThat(attributes.get("rerouted"), is(Boolean.FALSE));
-            assertThat(attributes.get("node_id"), is(localNodeId));
-        }));
-    }
-
-    public void testNoRerouting_WhenEmptyNodeList() {
-        mockService(listener -> listener.onResponse(mock()));
-
-        when(inferenceServiceRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
-        when(inferenceServiceRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(
-            new RateLimitAssignment(List.of())
-        );
-
-        var listener = doExecute(taskType);
-
-        verify(listener).onResponse(any());
-        // Verify request was handled locally (not rerouted using TransportService)
-        verify(transportService, never()).sendRequest(any(), any(), any(), any());
-        // Verify request metric attributes were recorded on the node performing inference
-        verify(inferenceStats.inferenceDuration()).record(anyLong(), assertArg(attributes -> {
-            assertThat(attributes.get("rerouted"), is(Boolean.FALSE));
-            assertThat(attributes.get("node_id"), is(localNodeId));
-        }));
-    }
-
-    public void testRerouting_ToOtherNode() {
-        DiscoveryNode otherNode = mock(DiscoveryNode.class);
-        when(otherNode.getId()).thenReturn("other-node");
-
-        // The local node is different to the "other-node" responsible for serviceId
-        when(nodeClient.getLocalNodeId()).thenReturn("local-node");
-        when(inferenceServiceRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
-        // Requests for serviceId are always routed to "other-node"
-        var assignment = new RateLimitAssignment(List.of(otherNode));
-        when(inferenceServiceRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(assignment);
-
-        mockService(listener -> listener.onResponse(mock()));
-        var listener = doExecute(taskType);
-
-        // Verify request was rerouted
-        verify(transportService).sendRequest(same(otherNode), eq(InferenceAction.NAME), any(), any());
-        // Verify local execution didn't happen
-        verify(listener, never()).onResponse(any());
-        // Verify that request metric attributes were NOT recorded on the node rerouting the request to another node
-        verify(inferenceStats.inferenceDuration(), never()).record(anyLong(), any());
-    }
-
-    public void testRerouting_ToLocalNode_WithoutGoingThroughTransportLayerAgain() {
-        DiscoveryNode localNode = mock(DiscoveryNode.class);
-        String localNodeId = "local-node";
-        when(localNode.getId()).thenReturn(localNodeId);
-
-        // The local node is the only one responsible for serviceId
-        when(nodeClient.getLocalNodeId()).thenReturn(localNodeId);
-        when(inferenceServiceRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
-        var assignment = new RateLimitAssignment(List.of(localNode));
-        when(inferenceServiceRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(assignment);
-
-        mockService(listener -> listener.onResponse(mock()));
-        var listener = doExecute(taskType);
-
-        verify(listener).onResponse(any());
-        // Verify request was handled locally (not rerouted using TransportService)
-        verify(transportService, never()).sendRequest(any(), any(), any(), any());
-        // Verify request metric attributes were recorded on the node performing inference
-        verify(inferenceStats.inferenceDuration()).record(anyLong(), assertArg(attributes -> {
-            assertThat(attributes.get("rerouted"), is(Boolean.FALSE));
-            assertThat(attributes.get("node_id"), is(localNodeId));
-        }));
-    }
-
-    public void testRerouting_HandlesTransportException_FromOtherNode() {
-        DiscoveryNode otherNode = mock(DiscoveryNode.class);
-        when(otherNode.getId()).thenReturn("other-node");
-
-        when(nodeClient.getLocalNodeId()).thenReturn("local-node");
-        when(inferenceServiceRateLimitCalculator.isTaskTypeReroutingSupported(serviceId, taskType)).thenReturn(true);
-        var assignment = new RateLimitAssignment(List.of(otherNode));
-        when(inferenceServiceRateLimitCalculator.getRateLimitAssignment(serviceId, taskType)).thenReturn(assignment);
-
-        mockService(listener -> listener.onResponse(mock()));
-
-        TransportException expectedException = new TransportException("Failed to route");
-        doAnswer(invocation -> {
-            TransportResponseHandler<?> handler = invocation.getArgument(3);
-            handler.handleException(expectedException);
-            return null;
-        }).when(transportService).sendRequest(any(), any(), any(), any());
-
-        var listener = doExecute(taskType);
-
-        // Verify request was rerouted
-        verify(transportService).sendRequest(same(otherNode), eq(InferenceAction.NAME), any(), any());
-        // Verify local execution didn't happen
-        verify(listener, never()).onResponse(any());
-        // Verify exception was propagated from "other-node" to "local-node"
-        verify(listener).onFailure(same(expectedException));
-        // Verify that request metric attributes were NOT recorded on the node rerouting the request to another node
-        verify(inferenceStats.inferenceDuration(), never()).record(anyLong(), any());
-    }
 }
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/TransportUnifiedCompletionActionTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/TransportUnifiedCompletionActionTests.java
@@ -19,7 +19,6 @@
 import org.elasticsearch.xpack.core.inference.action.UnifiedCompletionAction;
 import org.elasticsearch.xpack.core.inference.results.UnifiedChatCompletionException;
 import org.elasticsearch.xpack.inference.action.task.StreamingTaskManager;
-import org.elasticsearch.xpack.inference.common.InferenceServiceRateLimitCalculator;
 import org.elasticsearch.xpack.inference.registry.InferenceEndpointRegistry;
 
 import java.util.Optional;
@@ -49,7 +48,6 @@ protected BaseTransportInferenceAction<UnifiedCompletionAction.Request> createAc
         InferenceServiceRegistry serviceRegistry,
         InferenceStats inferenceStats,
         StreamingTaskManager streamingTaskManager,
-        InferenceServiceRateLimitCalculator inferenceServiceRateLimitCalculator,
         NodeClient nodeClient,
         ThreadPool threadPool
     ) {
@@ -61,7 +59,6 @@ protected BaseTransportInferenceAction<UnifiedCompletionAction.Request> createAc
             serviceRegistry,
             inferenceStats,
             streamingTaskManager,
-            inferenceServiceRateLimitCalculator,
             nodeClient,
             threadPool
         );
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/common/InferenceServiceNodeLocalRateLimitCalculatorTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/common/InferenceServiceNodeLocalRateLimitCalculatorTests.java