deepjavalibrary
diff --git a/‎.github/workflows/integration.yml‎
Lines changed: 4 additions & 2 deletions b/‎.github/workflows/integration.yml‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py‎
Lines changed: 54 additions & 2 deletions b/‎engines/python/setup/djl_python/lmi_vllm/vllm_async_service.py‎
Lines changed: 54 additions & 2 deletions
diff --git a/‎engines/python/src/main/java/ai/djl/python/engine/PyProcess.java‎
Lines changed: 2 additions & 1 deletion b/‎engines/python/src/main/java/ai/djl/python/engine/PyProcess.java‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎serving/src/main/java/ai/djl/serving/http/ManagementRequestHandler.java‎
Lines changed: 148 additions & 2 deletions b/‎serving/src/main/java/ai/djl/serving/http/ManagementRequestHandler.java‎
Lines changed: 148 additions & 2 deletions
diff --git a/‎serving/src/main/java/ai/djl/serving/sessions/PySessionManager.java‎
Lines changed: 40 additions & 0 deletions b/‎serving/src/main/java/ai/djl/serving/sessions/PySessionManager.java‎
Lines changed: 40 additions & 0 deletions
@@ -170,7 +170,9 @@ jobs:
           - test: TestCorrectnessTrtLlm
             instance: g6
             failure-prefix: trtllm
-
+          - test: TestStickyRouting
+            instance: g6
+            failure-prefix: lmi
     outputs:
       failure_cpu: ${{ steps.test-failure.outputs.failure_cpu }}
       failure_gpu: ${{ steps.test-failure.outputs.failure_gpu }}
@@ -268,4 +270,4 @@ jobs:
           ./stop_instance.sh $instance_id
 
           instance_id=${{ needs.create-runners.outputs.cpu_instance_id }}
-          ./stop_instance.sh $instance_id
+          ./stop_instance.sh $instance_id
@@ -30,8 +30,8 @@
 from djl_python.inputs import Input
 from djl_python.outputs import Output
 from djl_python.encode_decode import decode
-from djl_python.async_utils import handle_streaming_response, create_non_stream_output
-from djl_python.custom_formatter_handling import CustomFormatterHandler, CustomFormatterError
+from djl_python.async_utils import handle_streaming_response, create_non_stream_output, ProcessedRequest
+from djl_python.session_manager import SessionManager
 
 from .request_response_utils import (
     ProcessedRequest,
@@ -61,6 +61,7 @@ def __init__(self):
         self.vllm_engine_args = None
         self.vllm_properties = None
         self.model_name = None
+        self.session_manager = None
         self.initialized = False
 
     async def initialize(self, properties: dict):
@@ -119,12 +120,20 @@ async def initialize(self, properties: dict):
             tool_parser=self.vllm_properties.tool_call_parser,
             reasoning_parser=self.vllm_properties.reasoning_parser,
         )
+        self.session_manager: SessionManager = SessionManager(properties)
         self.initialized = True
 
     def preprocess_request(self, inputs: Input) -> ProcessedRequest:
         batch = inputs.get_batches()
         assert len(batch) == 1, "only one request per batch allowed"
         raw_request = batch[0]
+
+        # Get session id
+        session_id = raw_request.get_property("X-Amzn-SageMaker-Session-Id")
+        session = self.session_manager.get_session(session_id)
+        if session is None:
+            raise RuntimeError(f"Requested session {session_id} not found")
+
         content_type = raw_request.get_property("Content-Type")
         decoded_payload = decode(raw_request, content_type)
 
@@ -222,10 +231,53 @@ async def inference(
             tokenizer=self.tokenizer,
         )
 
+    async def create_session(self, inputs: Input):
+        await self.check_health()
+        outputs = Output()
+        session = self.session_manager.create_session()
+        outputs.add_property("X-Amzn-SageMaker-Session-Id", session.session_id)
+        outputs.add_property("Content-Type", "application/json")
+        outputs.add(Output.binary_encode(
+            {"result": f"Session {session.session_id} created"}),
+                    key="result")
+        logger.info(f"Session {session.session_id} created")
+        return outputs
+
+    async def close_session(self, inputs: Input):
+        await self.check_health()
+        outputs = Output()
+        session_id = inputs.get_property("X-Amzn-SageMaker-Session-Id")
+        self.session_manager.close_session(session_id)
+        outputs.add_property("X-Amzn-SageMaker-Session-Closed", "true")
+        outputs.add_property("Content-Type", "application/json")
+        outputs.add(Output.binary_encode(
+            {"result": f"Session {session_id} closed"}),
+                    key="result")
+        logger.info(f"Session {session_id} closed")
+        return outputs
+
 
 service = VLLMHandler()
 
 
+async def create_session(inputs: Input) -> Output:
+    if not service.initialized:
+        await service.initialize(inputs.get_properties())
+        logger.info("vllm service initialized")
+
+    outputs = await service.create_session(inputs)
+    return outputs
+
+
+async def close_session(inputs: Input) -> Output:
+    if not service.initialized:
+        await service.initialize(inputs.get_properties())
+        logger.info("vllm service initialized")
+
+    outputs = await service.create_session(inputs)
+    return outputs
+
+
 async def handle(
         inputs: Input
 ) -> Optional[Union[Output, AsyncGenerator[Output, None]]]:
 
@@ -158,7 +158,8 @@ Output predict(Input inputs, int timeout, boolean initialLoad) throws TranslateE
         // In RollingBatch, we queue adapter loading jobs to occur after the initial load.
         // Executing those in RollingBatch context doesn't work, so we need to handle them in the
         // 'standard' way.
-        if (initialLoad || inputs.getProperty("handler", null) != null) {
+        if (initialLoad
+                || (inputs.getProperty("handler", null) != null && asyncRequestManager == null)) {
             return predictStandard(inputs, timeout, initialLoad);
         }
         if (rollingBatch != null) {
 
@@ -21,13 +21,20 @@
 import ai.djl.serving.http.list.ListWorkflowsResponse;
 import ai.djl.serving.models.Endpoint;
 import ai.djl.serving.models.ModelManager;
+import ai.djl.serving.sessions.SessionManager;
+import ai.djl.serving.util.ConfigManager;
 import ai.djl.serving.util.NettyUtils;
 import ai.djl.serving.wlm.ModelInfo;
+import ai.djl.serving.wlm.WorkLoadManager;
+import ai.djl.serving.wlm.WorkerPool;
 import ai.djl.serving.wlm.WorkerPoolConfig;
+import ai.djl.serving.wlm.util.WlmCapacityException;
+import ai.djl.serving.wlm.util.WlmException;
 import ai.djl.serving.workflow.BadWorkflowException;
 import ai.djl.serving.workflow.Workflow;
 import ai.djl.serving.workflow.WorkflowDefinition;
 import ai.djl.serving.workflow.WorkflowTemplates;
+import ai.djl.translate.TranslateException;
 import ai.djl.util.JsonUtils;
 import ai.djl.util.Pair;
 
@@ -40,32 +47,41 @@
 import io.netty.handler.codec.http.QueryStringDecoder;
 import io.netty.util.CharsetUtil;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import java.io.IOException;
 import java.lang.reflect.Method;
 import java.net.URI;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
+import java.util.NoSuchElementException;
 import java.util.concurrent.CompletableFuture;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 /** A class handling inbound HTTP requests to the management API. */
 public class ManagementRequestHandler extends HttpRequestHandler {
 
+    private static final Logger logger = LoggerFactory.getLogger(ManagementRequestHandler.class);
+
     private static final Pattern WORKFLOWS_PATTERN = Pattern.compile("^/workflows([/?].*)?");
     private static final Pattern MODELS_PATTERN = Pattern.compile("^/models([/?].*)?");
     private static final Pattern INVOKE_PATTERN = Pattern.compile("^/models/.+/invoke$");
     private static final Pattern SERVER_PATTERN = Pattern.compile("^/server/.+");
+    private static final Pattern SESSION_PATTERN = Pattern.compile("^/(create|close)_session");
 
     /** {@inheritDoc} */
     @Override
     public boolean acceptInboundMessage(Object msg) throws Exception {
         if (super.acceptInboundMessage(msg)) {
             FullHttpRequest req = (FullHttpRequest) msg;
             String uri = req.uri();
-            if (WORKFLOWS_PATTERN.matcher(uri).matches() || SERVER_PATTERN.matcher(uri).matches()) {
+            if (WORKFLOWS_PATTERN.matcher(uri).matches()
+                    || SERVER_PATTERN.matcher(uri).matches()
+                    || SESSION_PATTERN.matcher(uri).matches()) {
                 return true;
             } else if (AdapterManagementRequestHandler.ADAPTERS_PATTERN.matcher(uri).matches()) {
                 return false;
@@ -107,7 +123,11 @@ protected void handleRequest(
                 }
                 return;
             } else if (HttpMethod.POST.equals(method)) {
-                if ("models".equals(segments[1])) {
+                if ("create_session".equals(segments[1])) {
+                    handleCreateSession(ctx);
+                } else if ("close_session".equals(segments[1])) {
+                    handleCloseSession(ctx, req);
+                } else if ("models".equals(segments[1])) {
                     handleRegisterModel(ctx, req, decoder);
                 } else {
                     handleRegisterWorkflow(ctx, decoder);
@@ -384,6 +404,95 @@ private void handleScaleWorkflow(
         }
     }
 
+    /**
+     * Creates a session on the single startup model and replies asynchronously.
+     *
+     * @param ctx the channel context used to send the response
+     * @throws BadRequestException if there is no single startup workflow, no worker pool exists
+     *     for it, or the pool's configuration is not a model
+     */
+    private void handleCreateSession(final ChannelHandlerContext ctx) {
+        WorkLoadManager wlm = ModelManager.getInstance().getWorkLoadManager();
+        String modelName =
+                ModelManager.getInstance()
+                        .getSingleStartupWorkflow()
+                        .orElseThrow(
+                                () ->
+                                        new BadRequestException(
+                                                "there should be only a single startup model"
+                                                        + " used."));
+        WorkerPool<Input, Output> pool = wlm.getWorkerPoolById(modelName);
+        if (pool == null) {
+            throw new BadRequestException(
+                    HttpResponseStatus.NOT_FOUND.code(),
+                    "The model " + modelName + " was not found");
+        }
+
+        SessionManager<Input, Output> sessions = SessionManager.newInstance(getModelInfo(pool));
+        sessions.createSession(wlm)
+                .whenCompleteAsync(
+                        (output, error) -> {
+                            if (output == null) {
+                                // Nothing to send from here; an upstream failure is
+                                // propagated to the exceptionally() stage below.
+                                return;
+                            }
+                            int code = output.getCode();
+                            if (code >= 300) {
+                                throw new BadRequestException(code, output.getMessage());
+                            }
+                            NettyUtils.sendJsonResponse(
+                                    ctx,
+                                    new StatusResponse(output.getMessage()),
+                                    HttpResponseStatus.valueOf(code));
+                        })
+                .exceptionally(
+                        failure -> {
+                            onException(failure.getCause(), ctx);
+                            return null;
+                        });
+    }
+
+    /**
+     * Closes the session named by the {@code X-Amzn-SageMaker-Session-Id} request header on the
+     * single startup model and replies asynchronously.
+     *
+     * @param ctx the channel context used to send the response
+     * @param req the inbound request carrying the session id header
+     * @throws BadRequestException if there is no single startup workflow, no worker pool exists
+     *     for it, the pool's configuration is not a model, or the session id header is missing
+     */
+    private void handleCloseSession(final ChannelHandlerContext ctx, FullHttpRequest req) {
+        WorkLoadManager wlm = ModelManager.getInstance().getWorkLoadManager();
+        String modelName =
+                ModelManager.getInstance()
+                        .getSingleStartupWorkflow()
+                        .orElseThrow(
+                                () ->
+                                        new BadRequestException(
+                                                "there should be only a single startup"
+                                                        + " model used."));
+        WorkerPool<Input, Output> wp = wlm.getWorkerPoolById(modelName);
+        if (wp == null) {
+            throw new BadRequestException(
+                    HttpResponseStatus.NOT_FOUND.code(),
+                    "The model " + modelName + " was not found");
+        }
+        ModelInfo<Input, Output> modelInfo = getModelInfo(wp);
+        String sessionId = req.headers().get("X-Amzn-SageMaker-Session-Id");
+        if (sessionId == null || sessionId.isEmpty()) {
+            // Reject early instead of forwarding a null session id to the worker.
+            throw new BadRequestException(
+                    "Missing required header: X-Amzn-SageMaker-Session-Id");
+        }
+
+        SessionManager<Input, Output> sessionManager = SessionManager.newInstance(modelInfo);
+        sessionManager
+                .closeSession(wlm, sessionId)
+                .whenCompleteAsync(
+                        (o, t) -> {
+                            if (o != null) {
+                                if (o.getCode() >= 300) {
+                                    throw new BadRequestException(o.getCode(), o.getMessage());
+                                }
+                                NettyUtils.sendJsonResponse(
+                                        ctx,
+                                        new StatusResponse(o.getMessage()),
+                                        HttpResponseStatus.valueOf(o.getCode()));
+                            }
+                        })
+                .exceptionally(
+                        t -> {
+                            // Unwrap the completion wrapper when there is one; fall back to
+                            // the throwable itself so the error handler never receives null.
+                            Throwable cause = t.getCause() != null ? t.getCause() : t;
+                            onException(cause, ctx);
+                            return null;
+                        });
+    }
+
+    /**
+     * Returns the worker pool's configuration as a {@link ModelInfo}.
+     *
+     * @param wp the worker pool to inspect
+     * @return the pool's configuration cast to {@code ModelInfo}
+     * @throws BadRequestException if the pool's configuration is not a {@code ModelInfo}
+     */
+    private ModelInfo<Input, Output> getModelInfo(WorkerPool<Input, Output> wp) {
+        if (wp.getWpc() instanceof ModelInfo) {
+            return (ModelInfo<Input, Output>) wp.getWpc();
+        }
+        String workerId = wp.getWpc().getId();
+        throw new BadRequestException("The worker " + workerId + " is not a model");
+    }
+
     @SuppressWarnings("unchecked")
     private void handleConfigLogs(ChannelHandlerContext ctx, QueryStringDecoder decoder) {
         String logLevel = NettyUtils.getParameter(decoder, "level", null);
@@ -408,4 +517,41 @@ private void handleConfigLogs(ChannelHandlerContext ctx, QueryStringDecoder deco
         StatusResponse resp = new StatusResponse("OK");
         NettyUtils.sendJsonResponse(ctx, resp);
     }
+
+    /**
+     * Maps a failure to an HTTP status code, logs it, and sends an error response.
+     *
+     * <p>The status code is chosen from the throwable's type; codes for translate, throttle,
+     * WLM and server errors are read from {@link ConfigManager}.
+     *
+     * @param t the failure to report
+     * @param ctx the channel context to reply on; when {@code null} nothing is sent and log
+     *     lines carry no request id prefix
+     */
+    private void onException(Throwable t, ChannelHandlerContext ctx) {
+        ConfigManager config = ConfigManager.getInstance();
+        int code;
+        String requestIdLogPrefix = "";
+        if (ctx != null) {
+            String requestId = NettyUtils.getRequestId(ctx.channel());
+            requestIdLogPrefix = "RequestId=[" + requestId + "]: ";
+        }
+        if (t instanceof TranslateException) {
+            logger.debug("{}{}", requestIdLogPrefix, t.getMessage(), t);
+            code = config.getBadRequestErrorHttpCode();
+        } else if (t instanceof BadRequestException) {
+            code = ((BadRequestException) t).getCode();
+        } else if (t instanceof WlmException) {
+            logger.warn("{}{}", requestIdLogPrefix, t.getMessage(), t);
+            if (t instanceof WlmCapacityException) {
+                code = config.getThrottleErrorHttpCode();
+            } else {
+                code = config.getWlmErrorHttpCode();
+            }
+        } else if (t instanceof NoSuchElementException) {
+            // Log with a constant format string, like the branches above, so the message
+            // is never empty and the prefix is never interpreted as a pattern.
+            logger.warn("{}{}", requestIdLogPrefix, t.getMessage(), t);
+            code = HttpResponseStatus.NOT_FOUND.code();
+        } else if (t instanceof IllegalArgumentException) {
+            logger.warn("{}{}", requestIdLogPrefix, t.getMessage(), t);
+            code = HttpResponseStatus.CONFLICT.code();
+        } else {
+            // The prefix already ends with ": " (or is empty), so no extra space is needed.
+            logger.warn("{}Unexpected error", requestIdLogPrefix, t);
+            code = config.getServerErrorHttpCode();
+        }
+        HttpResponseStatus status = HttpResponseStatus.valueOf(code);
+
+        if (ctx != null) {
+            NettyUtils.sendError(ctx, status, t);
+        }
+    }
 }
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ * with the License. A copy of the License is located at
+ *
+ * http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+ * OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
+ * and limitations under the License.
+ */
+package ai.djl.serving.sessions;
+
+import ai.djl.modality.Input;
+import ai.djl.modality.Output;
+import ai.djl.serving.wlm.ModelInfo;
+
+/** A {@link SessionManager} that builds session requests for the python engine. */
+public class PySessionManager extends SessionManager<Input, Output> {
+
+    protected PySessionManager(ModelInfo<Input, Output> modelInfo) {
+        super(modelInfo);
+    }
+
+    /** Builds the request whose {@code handler} property is {@code create_session}. */
+    @Override
+    protected Input getCreateSessionInput() {
+        return handlerInput("create_session");
+    }
+
+    /**
+     * Builds the request whose {@code handler} property is {@code close_session} and which
+     * carries the given session id in the {@code X-Amzn-SageMaker-Session-Id} property.
+     */
+    @Override
+    protected Input getCloseSessionInput(String sessionId) {
+        Input closeRequest = handlerInput("close_session");
+        closeRequest.addProperty("X-Amzn-SageMaker-Session-Id", sessionId);
+        return closeRequest;
+    }
+
+    /** Creates an empty input with only the {@code handler} property set. */
+    private static Input handlerInput(String handler) {
+        Input request = new Input();
+        request.addProperty("handler", handler);
+        return request;
+    }
+}