leo-hoet · lhoet-google · Apr 29, 2025 · Apr 29, 2025 · Apr 29, 2025 · Apr 30, 2025
diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java
@@ -254,6 +254,7 @@ static TransportVersion def(int id) {
     public static final TransportVersion ESQL_FIELD_ATTRIBUTE_DROP_TYPE = def(9_075_0_00);
     public static final TransportVersion ESQL_TIME_SERIES_SOURCE_STATUS = def(9_076_0_00);
     public static final TransportVersion ESQL_HASH_OPERATOR_STATUS_OUTPUT_TIME = def(9_077_0_00);
+    public static final TransportVersion ML_INFERENCE_VERTEXAI_CHATCOMPLETION_ADDED = def(9_078_0_00);
 
     /*
      * STOP! READ THIS FIRST! No, really,

diff --git a/.../elasticsearch/xpack/inference/external/response/streaming/JsonArrayPartsEventParser.java b/.../elasticsearch/xpack/inference/external/response/streaming/JsonArrayPartsEventParser.java
@@ -0,0 +1,86 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.inference.external.response.streaming;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.ArrayDeque;
+import java.util.Arrays;
+import java.util.Deque;
+
+/**
+ * Incrementally splits a stream of bytes that forms a JSON array of objects into its
+ * individual top-level objects. Each complete JSON object is emitted as its own byte array;
+ * bytes outside of an object (the surrounding brackets, commas, whitespace) are discarded.
+ *
+ * <p>Example of an expected stream:
+ * <pre>
+ * Chunk 1: [{"key":"val1"}
+ * Chunk 2: ,{"key2":"val2"}
+ * Chunk 3: ,{"key3":"val3"}, {"some":"object"}]
+ * </pre>
+ *
+ * This parser would emit four byte arrays, with data:
+ * <ol>
+ *   <li>{"key":"val1"}</li>
+ *   <li>{"key2":"val2"}</li>
+ *   <li>{"key3":"val3"}</li>
+ *   <li>{"some":"object"}</li>
+ * </ol>
+ *
+ * <p>Braces that appear inside JSON string values (for example {@code {"text":"a } b"}}) are
+ * treated as plain text and do not affect object boundaries.
+ *
+ * <p>An instance keeps the bytes of a not-yet-complete object between calls to
+ * {@link #parse(byte[])}, so one instance must be used per stream.
+ */
+public class JsonArrayPartsEventParser {
+
+    // Buffer to hold bytes from the previous call if they formed an incomplete JSON object.
+    // When non-empty it always starts at the opening brace of that object.
+    private final ByteArrayOutputStream incompletePart = new ByteArrayOutputStream();
+
+    /**
+     * Feeds the next chunk of the stream to the parser.
+     *
+     * @param newBytes the next chunk of bytes; {@code null} or an empty array is a no-op
+     * @return the JSON objects completed by this chunk, in stream order; empty if none completed
+     */
+    public Deque<byte[]> parse(byte[] newBytes) {
+        if (newBytes == null || newBytes.length == 0) {
+            return new ArrayDeque<>(0);
+        }
+
+        // Prepend whatever was left over from the previous call so that the scan below always
+        // restarts at the beginning of the unfinished object.
+        ByteArrayOutputStream currentStream = new ByteArrayOutputStream();
+        try {
+            currentStream.write(incompletePart.toByteArray());
+            currentStream.write(newBytes);
+        } catch (IOException e) {
+            throw new UncheckedIOException("Error handling byte array streams", e);
+        }
+        incompletePart.reset();
+
+        byte[] dataToProcess = currentStream.toByteArray();
+        return parseInternal(dataToProcess);
+    }
+
+    /**
+     * Scans {@code data} for complete top-level objects and stores the trailing incomplete
+     * object, if any, in {@link #incompletePart}.
+     *
+     * <p>Because the leftover always begins at an object's opening brace, the string/escape
+     * state can be rebuilt from scratch on every call and does not need to be kept in fields.
+     * All structural characters are single ASCII bytes and every byte of a multi-byte UTF-8
+     * sequence has its high bit set, so comparing raw bytes against ASCII literals is safe.
+     */
+    private Deque<byte[]> parseInternal(byte[] data) {
+        int braceLevel = 0;
+        int objectStartIndex = -1;
+        boolean inString = false;
+        boolean escaped = false;
+        Deque<byte[]> completedObjects = new ArrayDeque<>();
+
+        for (int i = 0; i < data.length; i++) {
+            byte b = data[i];
+
+            if (braceLevel == 0) {
+                // Between objects only an opening brace is meaningful; everything else is skipped.
+                if (b == '{') {
+                    objectStartIndex = i;
+                    braceLevel = 1;
+                }
+                continue;
+            }
+
+            if (inString) {
+                if (escaped) {
+                    // The byte following a backslash never terminates the string.
+                    escaped = false;
+                } else if (b == '\\') {
+                    escaped = true;
+                } else if (b == '"') {
+                    inString = false;
+                }
+                continue;
+            }
+
+            if (b == '"') {
+                inString = true;
+            } else if (b == '{') {
+                braceLevel++;
+            } else if (b == '}') {
+                braceLevel--;
+                if (braceLevel == 0) {
+                    completedObjects.offer(Arrays.copyOfRange(data, objectStartIndex, i + 1));
+                    objectStartIndex = -1;
+                }
+            }
+        }
+
+        if (braceLevel > 0) {
+            // Keep the unfinished object (from its opening brace) for the next call.
+            incompletePart.write(data, objectStartIndex, data.length - objectStartIndex);
+        }
+        return completedObjects;
+    }
+}
diff --git a/...asticsearch/xpack/inference/external/response/streaming/JsonArrayPartsEventProcessor.java b/...asticsearch/xpack/inference/external/response/streaming/JsonArrayPartsEventProcessor.java
@@ -0,0 +1,37 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.inference.external.response.streaming;
+
+import org.elasticsearch.xpack.inference.common.DelegatingProcessor;
+import org.elasticsearch.xpack.inference.external.http.HttpResult;
+
+import java.util.Deque;
+
+/**
+ * Stream processor that turns raw HTTP result chunks into batches of complete JSON objects
+ * by running each non-empty body through a {@link JsonArrayPartsEventParser}.
+ */
+public class JsonArrayPartsEventProcessor extends DelegatingProcessor<HttpResult, Deque<byte[]>> {
+    private final JsonArrayPartsEventParser parser;
+
+    public JsonArrayPartsEventProcessor(JsonArrayPartsEventParser jsonArrayPartsEventParser) {
+        this.parser = jsonArrayPartsEventParser;
+    }
+
+    /**
+     * Forwards the objects completed by {@code item} downstream. When the body is empty or
+     * completes no object, one more item is requested from upstream instead.
+     */
+    @Override
+    public void next(HttpResult item) {
+        if (item.isBodyEmpty() == false) {
+            var completedObjects = parser.parse(item.body());
+            if (completedObjects.isEmpty() == false) {
+                downstream().onNext(completedObjects);
+                return;
+            }
+        }
+
+        // Nothing to emit for this chunk: ask upstream for the next one.
+        upstream().request(1);
+    }
+}
diff --git a/...earch/xpack/inference/services/googlevertexai/GoogleVertexAiCompletionRequestManager.java b/...earch/xpack/inference/services/googlevertexai/GoogleVertexAiCompletionRequestManager.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.inference.services.googlevertexai;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.inference.InferenceServiceResults;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.elasticsearch.xpack.inference.external.http.retry.RequestSender;
+import org.elasticsearch.xpack.inference.external.http.retry.ResponseHandler;
+import org.elasticsearch.xpack.inference.external.http.sender.ExecutableInferenceRequest;
+import org.elasticsearch.xpack.inference.external.http.sender.InferenceInputs;
+import org.elasticsearch.xpack.inference.external.http.sender.UnifiedChatInput;
+import org.elasticsearch.xpack.inference.services.googlevertexai.completion.GoogleVertexAiChatCompletionModel;
+import org.elasticsearch.xpack.inference.services.googlevertexai.request.GoogleVertexAiUnifiedChatCompletionRequest;
+import org.elasticsearch.xpack.inference.services.googlevertexai.response.GoogleVertexAiChatCompletionResponseEntity;
+
+import java.util.Objects;
+import java.util.function.Supplier;
+
+/**
+ * Request manager that builds and executes Google Vertex AI unified chat completion requests
+ * for a single {@link GoogleVertexAiChatCompletionModel}.
+ */
+public class GoogleVertexAiCompletionRequestManager extends GoogleVertexAiRequestManager {
+
+    private static final Logger logger = LogManager.getLogger(GoogleVertexAiCompletionRequestManager.class);
+
+    // Shared by all instances; built once from the chat completion response entity parser.
+    private static final ResponseHandler HANDLER = createGoogleVertexAiResponseHandler();
+
+    private static ResponseHandler createGoogleVertexAiResponseHandler() {
+        return new GoogleVertexAiUnifiedChatCompletionResponseHandler(
+            "Google Vertex AI chat completion",
+            GoogleVertexAiChatCompletionResponseEntity::fromResponse
+        );
+    }
+
+    private final GoogleVertexAiChatCompletionModel model;
+
+    /**
+     * @param model      the chat completion model requests are built for; must not be null
+     * @param threadPool passed through to the parent request manager
+     */
+    public GoogleVertexAiCompletionRequestManager(GoogleVertexAiChatCompletionModel model, ThreadPool threadPool) {
+        super(threadPool, model, RateLimitGrouping.of(model));
+        this.model = model;
+    }
+
+    /**
+     * Rate limit grouping key passed to the parent request manager; derived from the hash of
+     * the model's project id.
+     */
+    record RateLimitGrouping(int projectIdHash) {
+        public static RateLimitGrouping of(GoogleVertexAiChatCompletionModel model) {
+            Objects.requireNonNull(model);
+            return new RateLimitGrouping(model.rateLimitServiceSettings().projectId().hashCode());
+        }
+    }
+
+    /**
+     * Null-checking factory for {@link GoogleVertexAiCompletionRequestManager}.
+     *
+     * @throws NullPointerException if {@code model} or {@code threadPool} is null
+     */
+    public static GoogleVertexAiCompletionRequestManager of(GoogleVertexAiChatCompletionModel model, ThreadPool threadPool) {
+        Objects.requireNonNull(model);
+        Objects.requireNonNull(threadPool);
+
+        return new GoogleVertexAiCompletionRequestManager(model, threadPool);
+    }
+
+    /**
+     * Converts {@code inferenceInputs} to a {@link UnifiedChatInput}, wraps it in a
+     * {@link GoogleVertexAiUnifiedChatCompletionRequest} for this manager's model and hands it
+     * to the parent for execution with the shared response handler.
+     */
+    @Override
+    public void execute(
+        InferenceInputs inferenceInputs,
+        RequestSender requestSender,
+        Supplier<Boolean> hasRequestCompletedFunction,
+        ActionListener<InferenceServiceResults> listener
+    ) {
+        // castTo replaces a raw cast; NOTE(review): presumably it reports an unsupported input
+        // type with a descriptive error instead of a ClassCastException - confirm in InferenceInputs.
+        var chatInputs = inferenceInputs.castTo(UnifiedChatInput.class);
+        var request = new GoogleVertexAiUnifiedChatCompletionRequest(chatInputs, model);
+        execute(new ExecutableInferenceRequest(requestSender, logger, request, HANDLER, hasRequestCompletedFunction, listener));
+    }
+}
diff --git a/.../elasticsearch/xpack/inference/services/googlevertexai/GoogleVertexAiResponseHandler.java b/.../elasticsearch/xpack/inference/services/googlevertexai/GoogleVertexAiResponseHandler.java
@@ -9,11 +9,14 @@
 
 import org.elasticsearch.xpack.inference.external.http.HttpResult;
 import org.elasticsearch.xpack.inference.external.http.retry.BaseResponseHandler;
+import org.elasticsearch.xpack.inference.external.http.retry.ErrorResponse;
 import org.elasticsearch.xpack.inference.external.http.retry.ResponseParser;
 import org.elasticsearch.xpack.inference.external.http.retry.RetryException;
 import org.elasticsearch.xpack.inference.external.request.Request;
 import org.elasticsearch.xpack.inference.services.googlevertexai.response.GoogleVertexAiErrorResponseEntity;
 
+import java.util.function.Function;
+
 import static org.elasticsearch.core.Strings.format;
 
 public class GoogleVertexAiResponseHandler extends BaseResponseHandler {
@@ -24,6 +27,15 @@ public GoogleVertexAiResponseHandler(String requestType, ResponseParser parseFun
         super(requestType, parseFunction, GoogleVertexAiErrorResponseEntity::fromResponse);
     }
 
+    /**
+     * Creates a handler with a caller-supplied error parser and streaming capability flag.
+     * All four arguments are forwarded unchanged to the superclass constructor.
+     *
+     * @param requestType                 forwarded to the superclass as the request type
+     * @param parseFunction               forwarded to the superclass as the response parser
+     * @param errorParseFunction          maps an {@link HttpResult} to an {@link ErrorResponse}
+     * @param canHandleStreamingResponses forwarded to the superclass as its streaming flag
+     */
+    public GoogleVertexAiResponseHandler(
+        String requestType,
+        ResponseParser parseFunction,
+        Function<HttpResult, ErrorResponse> errorParseFunction,
+        boolean canHandleStreamingResponses
+    ) {
+        super(requestType, parseFunction, errorParseFunction, canHandleStreamingResponses);
+    }
+
     @Override
     protected void checkForFailureStatusCode(Request request, HttpResult result) throws RetryException {
         if (result.isSuccessfulResponse()) {

diff --git a/...g/elasticsearch/xpack/inference/services/googlevertexai/GoogleVertexAiSecretSettings.java b/...g/elasticsearch/xpack/inference/services/googlevertexai/GoogleVertexAiSecretSettings.java
@@ -124,9 +124,8 @@ public static Map<String, SettingsConfiguration> get() {
                 var configurationMap = new HashMap<String, SettingsConfiguration>();
                 configurationMap.put(
                     SERVICE_ACCOUNT_JSON,
-                    new SettingsConfiguration.Builder(EnumSet.of(TaskType.TEXT_EMBEDDING, TaskType.RERANK)).setDescription(
-                        "API Key for the provider you're connecting to."
-                    )
+                    new SettingsConfiguration.Builder(EnumSet.of(TaskType.TEXT_EMBEDDING, TaskType.RERANK, TaskType.CHAT_COMPLETION))
+                        .setDescription("API Key for the provider you're connecting to.")
                         .setLabel("Credentials JSON")
                         .setRequired(true)
                         .setSensitive(true)

diff --git a/...java/org/elasticsearch/xpack/inference/services/googlevertexai/GoogleVertexAiService.java b/...java/org/elasticsearch/xpack/inference/services/googlevertexai/GoogleVertexAiService.java
@@ -29,6 +29,7 @@
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.xpack.inference.chunking.ChunkingSettingsBuilder;
 import org.elasticsearch.xpack.inference.chunking.EmbeddingRequestChunker;
+import org.elasticsearch.xpack.inference.external.action.SenderExecutableAction;
 import org.elasticsearch.xpack.inference.external.http.sender.EmbeddingsInput;
 import org.elasticsearch.xpack.inference.external.http.sender.HttpRequestSender;
 import org.elasticsearch.xpack.inference.external.http.sender.InferenceInputs;
@@ -38,6 +39,7 @@
 import org.elasticsearch.xpack.inference.services.ServiceComponents;
 import org.elasticsearch.xpack.inference.services.ServiceUtils;
 import org.elasticsearch.xpack.inference.services.googlevertexai.action.GoogleVertexAiActionCreator;
+import org.elasticsearch.xpack.inference.services.googlevertexai.completion.GoogleVertexAiChatCompletionModel;
 import org.elasticsearch.xpack.inference.services.googlevertexai.embeddings.GoogleVertexAiEmbeddingsModel;
 import org.elasticsearch.xpack.inference.services.googlevertexai.embeddings.GoogleVertexAiEmbeddingsServiceSettings;
 import org.elasticsearch.xpack.inference.services.googlevertexai.rerank.GoogleVertexAiRerankModel;
@@ -47,25 +49,31 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
+import static org.elasticsearch.xpack.inference.external.action.ActionUtils.constructFailedToSendRequestMessage;
 import static org.elasticsearch.xpack.inference.services.ServiceFields.MODEL_ID;
 import static org.elasticsearch.xpack.inference.services.ServiceUtils.createInvalidModelException;
 import static org.elasticsearch.xpack.inference.services.ServiceUtils.parsePersistedConfigErrorMsg;
 import static org.elasticsearch.xpack.inference.services.ServiceUtils.removeFromMap;
 import static org.elasticsearch.xpack.inference.services.ServiceUtils.removeFromMapOrDefaultEmpty;
 import static org.elasticsearch.xpack.inference.services.ServiceUtils.removeFromMapOrThrowIfNull;
 import static org.elasticsearch.xpack.inference.services.ServiceUtils.throwIfNotEmptyMap;
-import static org.elasticsearch.xpack.inference.services.ServiceUtils.throwUnsupportedUnifiedCompletionOperation;
 import static org.elasticsearch.xpack.inference.services.googlevertexai.GoogleVertexAiServiceFields.EMBEDDING_MAX_BATCH_SIZE;
 import static org.elasticsearch.xpack.inference.services.googlevertexai.GoogleVertexAiServiceFields.LOCATION;
 import static org.elasticsearch.xpack.inference.services.googlevertexai.GoogleVertexAiServiceFields.PROJECT_ID;
+import static org.elasticsearch.xpack.inference.services.googlevertexai.action.GoogleVertexAiActionCreator.COMPLETION_ERROR_PREFIX;
 
 public class GoogleVertexAiService extends SenderService {
 
     public static final String NAME = "googlevertexai";
 
     private static final String SERVICE_NAME = "Google Vertex AI";
-    private static final EnumSet<TaskType> supportedTaskTypes = EnumSet.of(TaskType.TEXT_EMBEDDING, TaskType.RERANK);
+    private static final EnumSet<TaskType> supportedTaskTypes = EnumSet.of(
+        TaskType.TEXT_EMBEDDING,
+        TaskType.RERANK,
+        TaskType.CHAT_COMPLETION
+    );
 
     public static final EnumSet<InputType> VALID_INPUT_TYPE_VALUES = EnumSet.of(
         InputType.INGEST,
@@ -76,6 +84,11 @@ public class GoogleVertexAiService extends SenderService {
         InputType.INTERNAL_SEARCH
     );
 
+    /**
+     * Returns the task types this service reports as streaming-capable.
+     *
+     * @return a new set containing only {@link TaskType#CHAT_COMPLETION}
+     */
+    @Override
+    public Set<TaskType> supportedStreamingTasks() {
+        return EnumSet.of(TaskType.CHAT_COMPLETION);
+    }
+
     public GoogleVertexAiService(HttpRequestSender.Factory factory, ServiceComponents serviceComponents) {
         super(factory, serviceComponents);
     }
@@ -220,7 +233,18 @@ protected void doUnifiedCompletionInfer(
         TimeValue timeout,
         ActionListener<InferenceServiceResults> listener
     ) {
-        throwUnsupportedUnifiedCompletionOperation(NAME);
+        if (model instanceof GoogleVertexAiChatCompletionModel == false) {
+            listener.onFailure(createInvalidModelException(model));
+            return;
+        }
+        var chatCompletionModel = (GoogleVertexAiChatCompletionModel) model;
+        var updatedChatCompletionModel = GoogleVertexAiChatCompletionModel.of(chatCompletionModel, inputs.getRequest());
+
+        var manager = GoogleVertexAiCompletionRequestManager.of(updatedChatCompletionModel, getServiceComponents().threadPool());
+
+        var errorMessage = constructFailedToSendRequestMessage(COMPLETION_ERROR_PREFIX);
+        var action = new SenderExecutableAction(getSender(), manager, errorMessage);
+        action.execute(inputs, timeout, listener);
     }
 
     @Override
@@ -320,6 +344,17 @@ private static GoogleVertexAiModel createModel(
                 secretSettings,
                 context
             );
+
+            case CHAT_COMPLETION -> new GoogleVertexAiChatCompletionModel(
+                inferenceEntityId,
+                taskType,
+                NAME,
+                serviceSettings,
+                taskSettings,
+                secretSettings,
+                context
+            );
+
             default -> throw new ElasticsearchStatusException(failureMessage, RestStatus.BAD_REQUEST);
         };
     }
@@ -348,7 +383,7 @@ public static InferenceServiceConfiguration get() {
 
                 configurationMap.put(
                     LOCATION,
-                    new SettingsConfiguration.Builder(EnumSet.of(TaskType.TEXT_EMBEDDING)).setDescription(
+                    new SettingsConfiguration.Builder(EnumSet.of(TaskType.TEXT_EMBEDDING, TaskType.CHAT_COMPLETION)).setDescription(
                         "Please provide the GCP region where the Vertex AI API(s) is enabled. "
                             + "For more information, refer to the {geminiVertexAIDocs}."
                     )