Moved rateLimitGroupingHash to subclasses of GoogleVertexAiModel

leo-hoet · leo-hoet · commit ab1fe7ad434a · 2025-06-04T14:53:08.000-03:00
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/googlevertexai/GoogleVertexAiModel.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/googlevertexai/GoogleVertexAiModel.java
@@ -60,15 +60,6 @@ public URI nonStreamingUri() {
         return nonStreamingUri;
     }
 
-    @Override
-    public int rateLimitGroupingHash() {
-        // In VertexAI rate limiting is scoped to the project, region and model. URI already has this information so we are using that.
-        // API Key does not affect the quota
-        // https://ai.google.dev/gemini-api/docs/rate-limits
-        // https://cloud.google.com/vertex-ai/docs/quotas
-        return Objects.hash(nonStreamingUri);
-    }
-
     @Override
     public RateLimitSettings rateLimitSettings() {
         return rateLimitServiceSettings().rateLimitSettings();
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/googlevertexai/GoogleVertexAiStreamingProcessor.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/googlevertexai/GoogleVertexAiStreamingProcessor.java
@@ -25,9 +25,6 @@
 import java.util.Objects;
 import java.util.stream.Stream;
 
-import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken;
-import static org.elasticsearch.xpack.inference.external.response.XContentUtils.moveToFirstToken;
-
 public class GoogleVertexAiStreamingProcessor extends DelegatingProcessor<Deque<ServerSentEvent>, InferenceServiceResults.Result> {
 
     @Override
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/googlevertexai/completion/GoogleVertexAiChatCompletionModel.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/googlevertexai/completion/GoogleVertexAiChatCompletionModel.java
@@ -156,4 +156,25 @@ public static URI buildUriStreaming(String location, String projectId, String mo
             .setCustomQuery(GoogleVertexAiUtils.QUERY_PARAM_ALT_SSE)
             .build();
     }
+
+    @Override
+    public int rateLimitGroupingHash() {
+        // In Vertex AI, rate limiting is scoped to the project, region, model and endpoint.
+        // The API key does not affect the quota, so it is not part of the hash.
+        // https://ai.google.dev/gemini-api/docs/rate-limits
+        // https://cloud.google.com/vertex-ai/docs/quotas
+        var projectId = getServiceSettings().projectId();
+        var location = getServiceSettings().location();
+        var modelId = getServiceSettings().modelId();
+
+        // We don't know beforehand which API (streaming or non-streaming) is going to be used, so we take a
+        // conservative approach and fold both endpoint names into the same rate limit group.
+        return Objects.hash(
+            projectId,
+            location,
+            modelId,
+            GoogleVertexAiUtils.GENERATE_CONTENT,
+            GoogleVertexAiUtils.STREAM_GENERATE_CONTENT
+        );
+    }
 }
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/googlevertexai/embeddings/GoogleVertexAiEmbeddingsModel.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/googlevertexai/embeddings/GoogleVertexAiEmbeddingsModel.java
@@ -23,6 +23,7 @@
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.Map;
+import java.util.Objects;
 
 import static org.elasticsearch.core.Strings.format;
 
@@ -150,4 +151,17 @@ public static URI buildUri(String location, String projectId, String modelId) th
             )
             .build();
     }
+
+    @Override
+    public int rateLimitGroupingHash() {
+        // In Vertex AI, rate limiting is scoped to the project, region, model and endpoint.
+        // The API key does not affect the quota, so it is not part of the hash.
+        // https://ai.google.dev/gemini-api/docs/rate-limits
+        // https://cloud.google.com/vertex-ai/docs/quotas
+        var projectId = getServiceSettings().projectId();
+        var location = getServiceSettings().location();
+        var modelId = getServiceSettings().modelId();
+
+        return Objects.hash(projectId, location, modelId, GoogleVertexAiUtils.PREDICT);
+    }
 }
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/googlevertexai/rerank/GoogleVertexAiRerankModel.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/googlevertexai/rerank/GoogleVertexAiRerankModel.java
@@ -22,6 +22,7 @@
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.Map;
+import java.util.Objects;
 
 import static org.elasticsearch.core.Strings.format;
 
@@ -132,4 +133,17 @@ public static URI buildUri(String projectId) throws URISyntaxException {
             )
             .build();
     }
+
+    @Override
+    public int rateLimitGroupingHash() {
+        // In Vertex AI, rate limiting is generally scoped to the project, region, model and endpoint.
+        // The API key does not affect the quota, so it is not part of the hash.
+        // https://ai.google.dev/gemini-api/docs/rate-limits
+        // https://cloud.google.com/vertex-ai/docs/quotas
+        var projectId = getServiceSettings().projectId();
+        var modelId = getServiceSettings().modelId();
+
+        // The location is omitted here because rerank uses a global endpoint; only project and model are hashed.
+        return Objects.hash(projectId, modelId);
+    }
 }