Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure if I should keep this class or delete it and use the base class instead.

Also, I am using GoogleVertexAiUnifiedChatCompletionResponseHandler.GoogleVertexAiErrorResponse::fromResponse. I preferred it this way, rather than moving that method into a class shared by GoogleVertexAiUnifiedChatCompletionResponseHandler and GoogleVertexAiChatCompletionResponseHandler, to avoid extending the class hierarchy — but let me know if you think otherwise.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure if I should keep this class or delete it and use the base class instead.

Up to you — I'd probably use the base class. If you want to keep this one, I don't think we need to accept the requestType; we can set it in this class directly.

Also, I am using GoogleVertexAiUnifiedChatCompletionResponseHandler.GoogleVertexAiErrorResponse::fromResponse. I preferred to use it that way and avoid putting that in a common class between GoogleVertexAiUnifiedChatCompletionResponseHandler and GoogleVertexAiChatCompletionResponseHandler to avoid extending the class hierarchy, but let me know if you think otherwise

Nice! That looks good.

Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.inference.services.googlevertexai;

import org.elasticsearch.xpack.inference.services.googlevertexai.response.GoogleVertexAiCompletionResponseEntity;

/**
 * Response handler for Google Vertex AI completion (non-unified) requests.
 *
 * Delegates response parsing to {@code GoogleVertexAiCompletionResponseEntity} and reuses the
 * error parsing from
 * {@code GoogleVertexAiUnifiedChatCompletionResponseHandler.GoogleVertexAiErrorResponse} so the
 * error format stays consistent with the unified chat-completion path without introducing a
 * shared intermediate class.
 */
public class GoogleVertexAiChatCompletionResponseHandler extends GoogleVertexAiResponseHandler {

    /**
     * @param requestType a human-readable label for this request type, used in error messages
     */
    public GoogleVertexAiChatCompletionResponseHandler(String requestType) {
        super(
            requestType,
            GoogleVertexAiCompletionResponseEntity::fromResponse,
            GoogleVertexAiUnifiedChatCompletionResponseHandler.GoogleVertexAiErrorResponse::fromResponse,
            // NOTE(review): presumably flags this handler as supporting streaming responses —
            // TODO confirm against the GoogleVertexAiResponseHandler constructor.
            true
        );
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,19 @@

package org.elasticsearch.xpack.inference.services.googlevertexai;

import org.elasticsearch.inference.InferenceServiceResults;
import org.elasticsearch.xpack.core.inference.results.StreamingChatCompletionResults;
import org.elasticsearch.xpack.inference.external.http.HttpResult;
import org.elasticsearch.xpack.inference.external.http.retry.BaseResponseHandler;
import org.elasticsearch.xpack.inference.external.http.retry.ErrorResponse;
import org.elasticsearch.xpack.inference.external.http.retry.ResponseParser;
import org.elasticsearch.xpack.inference.external.http.retry.RetryException;
import org.elasticsearch.xpack.inference.external.request.Request;
import org.elasticsearch.xpack.inference.external.response.streaming.ServerSentEventParser;
import org.elasticsearch.xpack.inference.external.response.streaming.ServerSentEventProcessor;
import org.elasticsearch.xpack.inference.services.googlevertexai.response.GoogleVertexAiErrorResponseEntity;

import java.util.concurrent.Flow;
import java.util.function.Function;

import static org.elasticsearch.core.Strings.format;
Expand Down Expand Up @@ -66,4 +71,14 @@ protected void checkForFailureStatusCode(Request request, HttpResult result) thr
// Builds the message reported when the remote endpoint returns a not-found status for the request URI.
private static String resourceNotFoundError(Request request) {
    return "Resource not found at [" + request.getURI() + "]";
}

/**
 * Builds the streaming result for a chat-style response.
 *
 * Wires a processing pipeline: raw HTTP chunks from {@code flow} are first split into
 * server-sent events, which are then parsed into chat-completion chunks by the
 * Google Vertex AI streaming processor. The subscription order below is what connects
 * the stages, so it must not be reordered.
 *
 * @param request the originating request (unused here, required by the interface)
 * @param flow    publisher of raw HTTP result chunks from the provider
 * @return streaming chat completion results backed by the processor chain
 */
@Override
public InferenceServiceResults parseResult(Request request, Flow.Publisher<HttpResult> flow) {
    var serverSentEventProcessor = new ServerSentEventProcessor(new ServerSentEventParser());
    var googleVertexAiProcessor = new GoogleVertexAiStreamingProcessor();

    // Stage 1: HTTP chunks -> server-sent events. Stage 2: events -> completion chunks.
    flow.subscribe(serverSentEventProcessor);
    serverSentEventProcessor.subscribe(googleVertexAiProcessor);
    return new StreamingChatCompletionResults(googleVertexAiProcessor);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ public class GoogleVertexAiService extends SenderService {

/**
 * Task types for which this service supports streaming responses.
 */
@Override
public Set<TaskType> supportedStreamingTasks() {
    return EnumSet.of(TaskType.CHAT_COMPLETION, TaskType.COMPLETION);
}

public GoogleVertexAiService(HttpRequestSender.Factory factory, ServiceComponents serviceComponents) {
Expand Down Expand Up @@ -222,11 +222,11 @@ protected void doInfer(
return;
}

var completionModel = (GoogleVertexAiCompletionModel) model;
GoogleVertexAiModel googleVertexAiModel = (GoogleVertexAiModel) model;

var actionCreator = new GoogleVertexAiActionCreator(getSender(), getServiceComponents());

var action = completionModel.accept(actionCreator, taskSettings);
var action = googleVertexAiModel.accept(actionCreator, taskSettings);
action.execute(inputs, timeout, listener);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.inference.services.googlevertexai;

import org.elasticsearch.ElasticsearchStatusException;
import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
import org.elasticsearch.inference.InferenceServiceResults;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.xcontent.XContentFactory;
import org.elasticsearch.xcontent.XContentParser;
import org.elasticsearch.xcontent.XContentParserConfiguration;
import org.elasticsearch.xcontent.XContentType;
import org.elasticsearch.xpack.core.inference.results.StreamingChatCompletionResults;
import org.elasticsearch.xpack.inference.common.DelegatingProcessor;
import org.elasticsearch.xpack.inference.external.response.streaming.ServerSentEvent;

import java.io.IOException;
import java.util.Deque;
import java.util.Objects;
import java.util.stream.Stream;

import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken;
import static org.elasticsearch.xpack.inference.external.response.XContentUtils.moveToFirstToken;

public class GoogleVertexAiStreamingProcessor extends DelegatingProcessor<Deque<ServerSentEvent>, InferenceServiceResults.Result> {

/**
 * Handles one batch of server-sent events: parses them into streaming results and either
 * emits them downstream or, when the batch produced nothing usable, requests more input.
 */
@Override
protected void next(Deque<ServerSentEvent> events) throws Exception {
    var config = XContentParserConfiguration.EMPTY.withDeprecationHandler(LoggingDeprecationHandler.INSTANCE);
    var parsed = parseEvent(events, GoogleVertexAiStreamingProcessor::parse, config);

    if (parsed.isEmpty() == false) {
        downstream().onNext(new StreamingChatCompletionResults.Results(parsed));
    } else {
        // Nothing usable in this batch; pull the next one from upstream.
        upstream().request(1);
    }
}

public static Stream<StreamingChatCompletionResults.Result> parse(XContentParserConfiguration parserConfig, ServerSentEvent event) {
String data = event.data();
try (XContentParser jsonParser = XContentFactory.xContent(XContentType.JSON).createParser(parserConfig, data)) {
moveToFirstToken(jsonParser);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know this is what OpenAiStreamingProcessor is using but I think we can omit this line and the ensureExpectedToken because the GoogleVertexAiUnifiedStreamingProcessor.GoogleVertexAiChatCompletionChunkParser.parse uses the constructing object parser which should handle this validation.

OpenAiStreamingProcessor doesn't leverage the constructing object parser which is why we need to do validation.

ensureExpectedToken(XContentParser.Token.START_OBJECT, jsonParser.currentToken(), jsonParser);

var chunk = GoogleVertexAiUnifiedStreamingProcessor.GoogleVertexAiChatCompletionChunkParser.parse(jsonParser);

return chunk.choices()
.stream()
.map(choice -> choice.delta())
.filter(Objects::nonNull)
.map(delta -> delta.content())
.filter(content -> content != null && content.isEmpty() == false)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I think we can use Strings.isNullOrEmpty here

.map(StreamingChatCompletionResults.Result::new);

} catch (IOException e) {
throw new ElasticsearchStatusException(
"Failed to parse event from inference provider: {}",
RestStatus.INTERNAL_SERVER_ERROR,
e,
event
);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ public InferenceServiceResults parseResult(Request request, Flow.Publisher<HttpR

@Override
protected Exception buildError(String message, Request request, HttpResult result, ErrorResponse errorResponse) {
assert request.isStreaming() : "Only streaming requests support this format";

var responseStatusCode = result.response().getStatusLine().getStatusCode();
var errorMessage = errorMessage(message, request, result, errorResponse, responseStatusCode);
var restStatus = toRestStatus(responseStatusCode);
Expand Down Expand Up @@ -108,7 +110,7 @@ private static Exception buildMidStreamError(Request request, String message, Ex
}
}

private static class GoogleVertexAiErrorResponse extends ErrorResponse {
public static class GoogleVertexAiErrorResponse extends ErrorResponse {
private static final Logger logger = LogManager.getLogger(GoogleVertexAiErrorResponse.class);
private static final ConstructingObjectParser<Optional<ErrorResponse>, Void> ERROR_PARSER = new ConstructingObjectParser<>(
"google_vertex_ai_error_wrapper",
Expand All @@ -135,7 +137,7 @@ private static class GoogleVertexAiErrorResponse extends ErrorResponse {
);
}

static ErrorResponse fromResponse(HttpResult response) {
public static ErrorResponse fromResponse(HttpResult response) {
try (
XContentParser parser = XContentFactory.xContent(XContentType.JSON)
.createParser(XContentParserConfiguration.EMPTY, response.body())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

package org.elasticsearch.xpack.inference.services.googlevertexai.action;

import org.elasticsearch.ElasticsearchStatusException;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.xpack.inference.external.action.ExecutableAction;
import org.elasticsearch.xpack.inference.external.action.SenderExecutableAction;
import org.elasticsearch.xpack.inference.external.action.SingleInputSenderExecutableAction;
Expand All @@ -16,14 +18,17 @@
import org.elasticsearch.xpack.inference.external.http.sender.Sender;
import org.elasticsearch.xpack.inference.external.http.sender.UnifiedChatInput;
import org.elasticsearch.xpack.inference.services.ServiceComponents;
import org.elasticsearch.xpack.inference.services.googlevertexai.GoogleVertexAiChatCompletionResponseHandler;
import org.elasticsearch.xpack.inference.services.googlevertexai.GoogleVertexAiEmbeddingsRequestManager;
import org.elasticsearch.xpack.inference.services.googlevertexai.GoogleVertexAiRerankRequestManager;
import org.elasticsearch.xpack.inference.services.googlevertexai.GoogleVertexAiUnifiedChatCompletionResponseHandler;
import org.elasticsearch.xpack.inference.services.googlevertexai.completion.GoogleVertexAiChatCompletionModel;
import org.elasticsearch.xpack.inference.services.googlevertexai.completion.GoogleVertexAiCompletionModel;
import org.elasticsearch.xpack.inference.services.googlevertexai.embeddings.GoogleVertexAiEmbeddingsModel;
import org.elasticsearch.xpack.inference.services.googlevertexai.request.GoogleVertexAiUnifiedChatCompletionRequest;
import org.elasticsearch.xpack.inference.services.googlevertexai.rerank.GoogleVertexAiRerankModel;

import java.net.URISyntaxException;
import java.util.Map;
import java.util.Objects;

Expand All @@ -36,9 +41,10 @@ public class GoogleVertexAiActionCreator implements GoogleVertexAiActionVisitor

private final ServiceComponents serviceComponents;

static final ResponseHandler COMPLETION_HANDLER = new GoogleVertexAiUnifiedChatCompletionResponseHandler(
static final ResponseHandler UNIFIED_CHAT_COMPLETION_HANDLER = new GoogleVertexAiUnifiedChatCompletionResponseHandler(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is another area that we need to refactor in the code base. The "unified" code path doesn't flow through the action creator like the other task types. Rerank, text embedding, and completion all originate through doInfer() and typically we call through to here to handle them based on the model class.

Chat completion aka unified flows through doUnifiedCompletionInfer() and doesn't use the action creator. So I think we can remove this.

I think we can also remove GoogleVertexAiCompletionModel and rely on GoogleVertexAiChatCompletionModel. If we receive GoogleVertexAiChatCompletionModel in here it should mean that we're trying to do completion not chat_completion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! Didn't notice that. I think this is the way to go, it removes a lot of code and it makes it simpler to follow

"Google VertexAI chat completion"
);
static final ResponseHandler CHAT_COMPLETION_HANDLER = new GoogleVertexAiChatCompletionResponseHandler("Google VertexAI completion");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's call this COMPLETION_HANDLER. We use chat/unified interchangeably (we need to clean that up throughout the code base).

static final String USER_ROLE = "user";

public GoogleVertexAiActionCreator(Sender sender, ServiceComponents serviceComponents) {
Expand Down Expand Up @@ -72,11 +78,31 @@ public ExecutableAction create(GoogleVertexAiChatCompletionModel model, Map<Stri
var manager = new GenericRequestManager<>(
serviceComponents.threadPool(),
model,
COMPLETION_HANDLER,
UNIFIED_CHAT_COMPLETION_HANDLER,
inputs -> new GoogleVertexAiUnifiedChatCompletionRequest(new UnifiedChatInput(inputs, USER_ROLE), model),
ChatCompletionInput.class
);

return new SingleInputSenderExecutableAction(sender, manager, failedToSendRequestErrorMessage, COMPLETION_ERROR_PREFIX);
}

@Override
public ExecutableAction create(GoogleVertexAiCompletionModel model, Map<String, Object> taskSettings) {
var failedToSendRequestErrorMessage = constructFailedToSendRequestMessage(COMPLETION_ERROR_PREFIX);

var manager = new GenericRequestManager<>(serviceComponents.threadPool(), model, CHAT_COMPLETION_HANDLER, inputs -> {
try {
model.updateUri(inputs.stream());
} catch (URISyntaxException e) {
throw new ElasticsearchStatusException(
"Error constructing URI for Google VertexAI completion",
RestStatus.INTERNAL_SERVER_ERROR,
e
);
}
return new GoogleVertexAiUnifiedChatCompletionRequest(new UnifiedChatInput(inputs, USER_ROLE), model);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of calling model.updateUri we can just pass in the inputs and model like openai does. Then in the constructor of GoogleVertexAiUnifiedChatCompletionRequest we can check the inputs.stream() flag to determine if we're streaming and retrieve the appropriate URI.

}, ChatCompletionInput.class);

return new SingleInputSenderExecutableAction(sender, manager, failedToSendRequestErrorMessage, COMPLETION_ERROR_PREFIX);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import org.elasticsearch.xpack.inference.external.action.ExecutableAction;
import org.elasticsearch.xpack.inference.services.googlevertexai.completion.GoogleVertexAiChatCompletionModel;
import org.elasticsearch.xpack.inference.services.googlevertexai.completion.GoogleVertexAiCompletionModel;
import org.elasticsearch.xpack.inference.services.googlevertexai.embeddings.GoogleVertexAiEmbeddingsModel;
import org.elasticsearch.xpack.inference.services.googlevertexai.rerank.GoogleVertexAiRerankModel;

Expand All @@ -21,4 +22,6 @@ public interface GoogleVertexAiActionVisitor {
ExecutableAction create(GoogleVertexAiRerankModel model, Map<String, Object> taskSettings);

ExecutableAction create(GoogleVertexAiChatCompletionModel model, Map<String, Object> taskSettings);

ExecutableAction create(GoogleVertexAiCompletionModel model, Map<String, Object> taskSettings);
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@

import org.apache.http.client.utils.URIBuilder;
import org.elasticsearch.inference.TaskType;
import org.elasticsearch.xpack.inference.external.action.ExecutableAction;
import org.elasticsearch.xpack.inference.services.ConfigurationParseContext;
import org.elasticsearch.xpack.inference.services.googlevertexai.action.GoogleVertexAiActionVisitor;
import org.elasticsearch.xpack.inference.services.googlevertexai.request.GoogleVertexAiUtils;

import java.net.URI;
Expand Down Expand Up @@ -39,6 +41,25 @@ public GoogleVertexAiCompletionModel(

}

public void updateUri(boolean isStream) throws URISyntaxException {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of making the uri mutable here, how about we expose two methods, one to retrieve the streaming URI and one to retrieve the non-streaming URI? We'll need to change how the base model (GoogleVertexAiModel) handles the URI and the rate limiting.

Does google allow streaming text embeddings?

For the changes to GoogleVertexAiModel how about we rename uri and getUri to nonStreamingUri and getNonStreamingUri.

Then in GoogleVertexAiChatCompletionModel we can expose a new method that returns the streaming URI.

For the rate limiting maybe we could make it an abstract method in the GoogleVertexAiModel and have the child classes handle it. I think it'll be nearly the same for all of them except maybe rerank which I think doesn't use location. That way we don't need to rely on the URI for the rate limiting but the specific pieces (except streaming vs non-streaming) that would build the URI.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it makes sense.

Does google allow streaming text embeddings?

I think so but it's not as straightforward as with the generate content API. The streaming predict API https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.publishers.models/serverStreamingPredict?hl=en returns a different object than the predict API

For the rate limiting maybe we could make it an abstract method in the GoogleVertexAiModel and have the child classes handle it

Sounds good!

var location = getServiceSettings().location();
var projectId = getServiceSettings().projectId();
var model = getServiceSettings().modelId();

// Google VertexAI generates streaming response using another API. We call this
// method before making the request to be sure we are calling the right API
if (isStream) {
this.uri = GoogleVertexAiChatCompletionModel.buildUri(location, projectId, model);
} else {
this.uri = GoogleVertexAiCompletionModel.buildUri(location, projectId, model);
}
}

/**
 * Visitor entry point: dispatches to the visitor's overload for this completion model,
 * which builds the executable action for it.
 */
@Override
public ExecutableAction accept(GoogleVertexAiActionVisitor visitor, Map<String, Object> taskSettings) {
    return visitor.create(this, taskSettings);
}

public static URI buildUri(String location, String projectId, String model) throws URISyntaxException {
return new URIBuilder().setScheme("https")
.setHost(format("%s%s", location, GoogleVertexAiUtils.GOOGLE_VERTEX_AI_HOST_SUFFIX))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import static org.elasticsearch.xpack.inference.external.response.XContentUtils.moveToFirstToken;

public class GoogleVertexAiCompletionResponseEntity {
/**
* Parses the response from Google Vertex AI's generateContent endpoint
Expand Down Expand Up @@ -91,7 +93,7 @@ public static InferenceServiceResults fromResponse(Request request, HttpResult r
XContentParser parser = XContentFactory.xContent(XContentType.JSON)
.createParser(XContentParserConfiguration.EMPTY, responseJson)
) {
parser.nextToken();
moveToFirstToken(parser);
chunk = GoogleVertexAiUnifiedStreamingProcessor.GoogleVertexAiChatCompletionChunkParser.parse(parser);
}
var results = chunk.choices().stream().map(choice -> choice.delta().content()).map(ChatCompletionResults.Result::new).toList();
Expand Down
Loading
Loading