diff --git a/clients/python/llmengine/data_types/model_endpoints.py b/clients/python/llmengine/data_types/model_endpoints.py
index 2e0877732..d6f23aa19 100644
--- a/clients/python/llmengine/data_types/model_endpoints.py
+++ b/clients/python/llmengine/data_types/model_endpoints.py
@@ -64,6 +64,21 @@ class CreateLLMEndpointRequest(VLLMEndpointAdditionalArgs, BaseModel):
         default=None,
         description="A Jinja template to use for this endpoint. If not provided, will use the chat template from the checkpoint",
     )
+    # Route configuration for multiple endpoints support
+    routes: Optional[List[str]] = Field(
+        default=None,
+        description="List of additional routes to forward to the user's service. "
+        "These routes will be added alongside the default /predict route. "
+        "Requires the passthrough forwarder type.",
+    )
+    extra_routes: Optional[List[str]] = Field(
+        default=None, description="Legacy field for additional routes. Use 'routes' instead."
+    )
+    forwarder_type: Optional[str] = Field(
+        default=None,
+        description="Type of forwarder to use. Set to 'passthrough' to enable "
+        "multiple route forwarding to your FastAPI service.",
+    )
 
 
 class CreateLLMEndpointResponse(BaseModel):
diff --git a/examples/multi_route_client_example.py b/examples/multi_route_client_example.py
new file mode 100644
index 000000000..56caa5c4b
--- /dev/null
+++ b/examples/multi_route_client_example.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""
+Example demonstrating how to deploy a multi-route FastAPI server using Launch.
+
+This example shows how to use the new route configuration parameters to deploy
+a FastAPI server with multiple endpoints that can be accessed through their
+natural paths rather than being restricted to just /predict.
+"""
+
+import time
+
+import requests
+from llmengine import Model
+from llmengine.data_types.core import ModelEndpointType
+from llmengine.data_types.model_endpoints import CreateLLMEndpointRequest
+
+
+def create_multi_route_endpoint():
+    """
+    Create a model endpoint with multiple routes using the new passthrough forwarder.
+    """
+
+    # Define the routes we want to expose from our FastAPI server
+    custom_routes = [
+        "/v1/chat/completions",  # OpenAI-compatible chat endpoint
+        "/v1/completions",  # OpenAI-compatible completions endpoint
+        "/analyze",  # Custom analysis endpoint
+        "/custom/endpoint",  # Custom GET endpoint
+        "/batch/process",  # Batch processing endpoint
+    ]
+
+    print("Creating model endpoint with multiple routes...")
+    print(f"Routes to be exposed: {custom_routes}")
+
+    # Create the endpoint with multi-route support
+    response = Model.create(
+        name="multi-route-fastapi-example",
+        model="llama-2-7b",  # Only used for bundle creation; the custom server handles the logic
+        inference_framework_image_tag="latest",
+        # Hardware configuration
+        cpus=4,
+        memory="8Gi",
+        storage="20Gi",
+        gpus=1,
+        gpu_type="nvidia-ampere-a10",
+        # Scaling configuration
+        min_workers=1,
+        max_workers=3,
+        per_worker=10,
+        endpoint_type=ModelEndpointType.STREAMING,
+        # NEW: Multi-route configuration
+        routes=custom_routes,  # List of routes to forward
+        forwarder_type="passthrough",  # Enable passthrough forwarding
+        # Other settings
+        public_inference=False,
+        labels={"example": "multi-route", "type": "fastapi"},
+    )
+
+    print(f"Endpoint created! Task ID: {response.endpoint_creation_task_id}")
+    return response.endpoint_creation_task_id
+
+
+def test_multi_route_endpoint(endpoint_name: str, base_url: str):
+    """
+    Test the multi-route endpoint by making requests to different routes.
+    """
+    print(f"\nTesting multi-route endpoint: {endpoint_name}")
+    print(f"Base URL: {base_url}")
+
+    # Test cases for different routes
+    test_cases = [
+        {
+            "name": "Traditional Predict",
+            "method": "POST",
+            "url": f"{base_url}/predict",
+            "data": {"text": "Hello world", "model": "custom"},
+        },
+        {
+            "name": "OpenAI Chat Completions",
+            "method": "POST",
+            "url": f"{base_url}/v1/chat/completions",
+            "data": {
+                "messages": [{"role": "user", "content": "Hello, how are you?"}],
+                "model": "gpt-3.5-turbo",
+                "max_tokens": 50,
+            },
+        },
+        {
+            "name": "OpenAI Completions",
+            "method": "POST",
+            "url": f"{base_url}/v1/completions",
+            "data": {
+                "prompt": "The future of AI is",
+                "model": "text-davinci-003",
+                "max_tokens": 50,
+            },
+        },
+        {
+            "name": "Custom Analysis",
+            "method": "POST",
+            "url": f"{base_url}/analyze",
+            "data": {"text": "This is a good example of multi-route functionality"},
+        },
+        {
+            "name": "Custom GET Endpoint",
+            "method": "GET",
+            "url": f"{base_url}/custom/endpoint",
+            "data": None,
+        },
+        {
+            "name": "Batch Processing",
+            "method": "POST",
+            "url": f"{base_url}/batch/process",
+            "data": {"texts": ["First text", "Second text", "Third text"]},
+        },
+    ]
+
+    # Execute test cases
+    for test_case in test_cases:
+        print(f"\n--- Testing {test_case['name']} ---")
+        print(f"URL: {test_case['url']}")
+
+        try:
+            if test_case["method"] == "GET":
+                response = requests.get(test_case["url"])
+            else:
+                response = requests.post(test_case["url"], json=test_case["data"])
+
+            print(f"Status: {response.status_code}")
+            if response.status_code == 200:
+                result = response.json()
+                print(f"Response: {result}")
+            else:
+                print(f"Error: {response.text}")
+
+        except requests.exceptions.RequestException as e:
+            print(f"Request failed: {e}")
+
+
+def main():
+    """
+    Main example workflow.
+    """
+
+    print("=" * 60)
+    print("Launch Multi-Route FastAPI Server Example")
+    print("=" * 60)
+
+    print(
+        """\
+This example demonstrates the new multi-route passthrough functionality in Launch.
+
+Instead of being limited to a single /predict endpoint, you can now:
+1. Specify multiple routes to be forwarded to your FastAPI server
+2. Use the passthrough forwarder type to enable full HTTP method support
+3. Access your endpoints through their natural paths
+
+Key benefits:
+- No more single endpoint limitation
+- Full FastAPI server compatibility
+- Support for GET, POST, PUT, DELETE, PATCH, HEAD, OPTIONS
+- OpenAI-compatible endpoints alongside custom routes
+- Easy migration of existing FastAPI applications
+"""
+    )
+
+    # Step 1: Create the multi-route endpoint
+    task_id = create_multi_route_endpoint()
+
+    print(f"\nEndpoint creation initiated with task ID: {task_id}")
+    print("Waiting for endpoint to be ready...")
+
+    # In a real scenario, you would poll the endpoint status
+    # For this example, we'll simulate waiting
+    print("⏳ Endpoint is being deployed...")
+    print("⏳ This may take several minutes...")
+
+    # Step 2: Once ready, test the endpoints
+    # Note: In practice, you'd get the actual endpoint URL from the Launch API
+    endpoint_name = "multi-route-fastapi-example"
+    base_url = f"https://your-launch-domain.com/v1/endpoints/{endpoint_name}"
+
+    print(f"\n✅ Endpoint ready! You can now test it at: {base_url}")
+    print("\nExample test calls you can make:")
+
+    # Show example curl commands
+    curl_examples = [
+        {
+            "name": "Traditional predict",
+            "cmd": f'curl -X POST {base_url}/predict -H "Content-Type: application/json" -d \'{{"text": "Hello world", "model": "custom"}}\'',
+        },
+        {
+            "name": "OpenAI chat",
+            "cmd": f'curl -X POST {base_url}/v1/chat/completions -H "Content-Type: application/json" -d \'{{"messages": [{{"role": "user", "content": "Hello!"}}], "model": "gpt-3.5-turbo"}}\'',
+        },
+        {
+            "name": "Custom analysis",
+            "cmd": f'curl -X POST {base_url}/analyze -H "Content-Type: application/json" -d \'{{"text": "This is amazing!"}}\'',
+        },
+        {"name": "Custom GET endpoint", "cmd": f"curl -X GET {base_url}/custom/endpoint"},
+    ]
+
+    for example in curl_examples:
+        print(f"\n{example['name']}:")
+        print(f"  {example['cmd']}")
+
+    print("\n" + "=" * 60)
+    print("Multi-Route Support Successfully Configured!")
+    print("=" * 60)
+
+    # Uncomment the following line to run actual tests if you have a deployed endpoint
+    # test_multi_route_endpoint(endpoint_name, base_url)
+
+
+if __name__ == "__main__":
+    main()
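The client example above only simulates the wait for deployment. A minimal readiness-polling sketch is shown below; it assumes the llmengine client exposes Model.get for an endpoint and that the returned object carries a status attribute that eventually reads READY, and the wait_until_ready helper itself is illustrative rather than part of this change.

import time

from llmengine import Model


def wait_until_ready(endpoint_name: str, timeout_s: int = 1800, poll_s: int = 30) -> None:
    """Poll the (assumed) Model.get status until the endpoint reports READY."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        endpoint = Model.get(endpoint_name)  # assumed client call
        status = str(getattr(endpoint, "status", "UNKNOWN"))
        print(f"{endpoint_name} status: {status}")
        if "READY" in status.upper():
            return
        time.sleep(poll_s)
    raise TimeoutError(f"{endpoint_name} was not ready after {timeout_s} seconds")

With a helper like this, main() could call wait_until_ready(endpoint_name) before running test_multi_route_endpoint().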
+""" + +from typing import Any, Dict, List, Optional + +import uvicorn +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel + +# FastAPI server with multiple routes +app = FastAPI(title="Multi-Route Example Server", version="1.0.0") + + +# Data models +class PredictRequest(BaseModel): + text: str + model: Optional[str] = "default" + + +class PredictResponse(BaseModel): + result: str + model: str + route: str + + +class HealthResponse(BaseModel): + status: str + routes: List[str] + + +class ChatMessage(BaseModel): + role: str + content: str + + +class ChatRequest(BaseModel): + messages: List[ChatMessage] + model: Optional[str] = "gpt-3.5-turbo" + max_tokens: Optional[int] = 100 + + +class ChatResponse(BaseModel): + choices: List[Dict[str, Any]] + model: str + usage: Dict[str, int] + + +class CompletionRequest(BaseModel): + prompt: str + model: Optional[str] = "text-davinci-003" + max_tokens: Optional[int] = 100 + + +class CompletionResponse(BaseModel): + choices: List[Dict[str, str]] + model: str + usage: Dict[str, int] + + +# Health check endpoint (required by Launch) +@app.get("/health", response_model=HealthResponse) +@app.get("/readyz", response_model=HealthResponse) +def health_check(): + """Health check endpoint required by Launch forwarder.""" + return HealthResponse( + status="healthy", + routes=[ + "/predict", + "/v1/chat/completions", + "/v1/completions", + "/analyze", + "/custom/endpoint", + ], + ) + + +# Traditional predict endpoint +@app.post("/predict", response_model=PredictResponse) +def predict(request: PredictRequest): + """Traditional ML prediction endpoint.""" + return PredictResponse( + result=f"Processed text: {request.text}", model=request.model, route="/predict" + ) + + +# OpenAI-compatible chat completions endpoint +@app.post("/v1/chat/completions", response_model=ChatResponse) +def chat_completions(request: ChatRequest): + """OpenAI-compatible chat completions endpoint.""" + # Simple echo implementation for example + last_message = ( + request.messages[-1] if request.messages else ChatMessage(role="user", content="") + ) + + return ChatResponse( + choices=[ + { + "message": {"role": "assistant", "content": f"Echo: {last_message.content}"}, + "finish_reason": "stop", + "index": 0, + } + ], + model=request.model, + usage={ + "prompt_tokens": len(last_message.content.split()), + "completion_tokens": len(last_message.content.split()) + 1, + "total_tokens": len(last_message.content.split()) * 2 + 1, + }, + ) + + +# OpenAI-compatible completions endpoint +@app.post("/v1/completions", response_model=CompletionResponse) +def completions(request: CompletionRequest): + """OpenAI-compatible completions endpoint.""" + return CompletionResponse( + choices=[ + {"text": f" -> Completion for: {request.prompt}", "finish_reason": "stop", "index": 0} + ], + model=request.model, + usage={ + "prompt_tokens": len(request.prompt.split()), + "completion_tokens": 10, + "total_tokens": len(request.prompt.split()) + 10, + }, + ) + + +# Custom analysis endpoint +@app.post("/analyze") +def analyze_text(data: Dict[str, Any]): + """Custom text analysis endpoint.""" + text = data.get("text", "") + if not text: + raise HTTPException(status_code=400, detail="Text field is required") + + return { + "analysis": { + "word_count": len(text.split()), + "char_count": len(text), + "sentiment": "positive" if "good" in text.lower() else "neutral", + }, + "text": text, + "route": "/analyze", + } + + +# Another custom endpoint +@app.get("/custom/endpoint") +def custom_endpoint(): + """A 
custom GET endpoint to demonstrate method flexibility.""" + return { + "message": "This is a custom endpoint accessible via passthrough routing", + "methods_supported": ["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"], + "route": "/custom/endpoint", + } + + +# Batch processing endpoint +@app.post("/batch/process") +def batch_process(data: Dict[str, List[str]]): + """Batch processing endpoint for multiple texts.""" + texts = data.get("texts", []) + return { + "results": [f"Processed: {text}" for text in texts], + "count": len(texts), + "route": "/batch/process", + } + + +if __name__ == "__main__": + # Run the server + uvicorn.run(app, host="0.0.0.0", port=5005) diff --git a/model-engine/model_engine_server/common/dtos/model_endpoints.py b/model-engine/model_engine_server/common/dtos/model_endpoints.py index 36a7c7f68..18d0aa66f 100644 --- a/model-engine/model_engine_server/common/dtos/model_endpoints.py +++ b/model-engine/model_engine_server/common/dtos/model_endpoints.py @@ -73,6 +73,21 @@ class CreateModelEndpointV1Request(BaseModel): default_callback_url: Optional[HttpUrlStr] = None default_callback_auth: Optional[CallbackAuth] = None public_inference: Optional[bool] = Field(default=False) + # Route configuration for multiple endpoints support + routes: Optional[List[str]] = Field( + default=None, + description="List of additional routes to forward to the user's service. " + "These routes will be added alongside the default /predict route. " + "Requires passthrough forwarder type.", + ) + extra_routes: Optional[List[str]] = Field( + default=None, description="Legacy field for additional routes. Use 'routes' instead." + ) + forwarder_type: Optional[str] = Field( + default=None, + description="Type of forwarder to use. Set to 'passthrough' to enable " + "multiple route forwarding to your FastAPI service.", + ) class CreateModelEndpointV1Response(BaseModel): diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 352b7a060..155a027d3 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -392,6 +392,10 @@ async def execute( chat_template_override: Optional[str], nodes_per_worker: int, additional_args: Optional[Dict[str, Any]] = None, + # Route configuration for multiple endpoints support + routes: Optional[List[str]] = None, + extra_routes: Optional[List[str]] = None, + forwarder_type: Optional[str] = None, ) -> ModelBundle: multinode = nodes_per_worker > 1 if source != LLMSource.HUGGING_FACE: @@ -459,6 +463,9 @@ async def execute( checkpoint_path, chat_template_override, additional_args=additional_vllm_args, + routes=routes, + extra_routes=extra_routes, + forwarder_type=forwarder_type, ) else: bundle_id = await self.create_vllm_bundle( @@ -471,6 +478,9 @@ async def execute( checkpoint_path, chat_template_override, additional_args=additional_vllm_args, + routes=routes, + extra_routes=extra_routes, + forwarder_type=forwarder_type, ) case LLMInferenceFramework.SGLANG: # pragma: no cover if not hmi_config.sglang_repository: @@ -991,6 +1001,9 @@ async def create_vllm_bundle( checkpoint_path: Optional[str], chat_template_override: Optional[str], additional_args: Optional[VLLMEndpointAdditionalArgs] = None, + routes: Optional[List[str]] = None, + extra_routes: Optional[List[str]] = None, + forwarder_type: Optional[str] = None, ): 
         command = self._create_vllm_bundle_command(
             model_name,
@@ -1005,6 +1018,20 @@ async def create_vllm_bundle(
             additional_args=additional_args,
         )
 
+        # Determine which routes to use - user-provided or defaults
+        final_routes = []
+        final_extra_routes = []
+        final_forwarder_type = forwarder_type
+
+        if routes is not None:
+            final_routes = routes
+        else:
+            # Default to OpenAI compatibility routes for VLLM
+            final_routes = [OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH]
+
+        if extra_routes is not None:
+            final_extra_routes = extra_routes
+
         create_model_bundle_v2_request = CreateModelBundleV2Request(
             name=endpoint_unique_name,
             schema_location="TBA",
@@ -1019,10 +1046,9 @@ async def create_vllm_bundle(
                 healthcheck_route="/health",
                 predict_route="/predict",
                 streaming_predict_route="/stream",
-                routes=[
-                    OPENAI_CHAT_COMPLETION_PATH,
-                    OPENAI_COMPLETION_PATH,
-                ],
+                routes=final_routes,
+                extra_routes=final_extra_routes,
+                forwarder_type=final_forwarder_type,
                 env={},
             ),
             metadata={},
@@ -1051,6 +1077,9 @@ async def create_vllm_multinode_bundle(
         checkpoint_path: Optional[str],
         chat_template_override: Optional[str],
         additional_args: Optional[VLLMEndpointAdditionalArgs] = None,
+        routes: Optional[List[str]] = None,
+        extra_routes: Optional[List[str]] = None,
+        forwarder_type: Optional[str] = None,
     ):
         leader_command = self._create_vllm_bundle_command(
             model_name,
@@ -1087,6 +1116,20 @@
             "RAY_CLUSTER_SIZE": "$(K8S_LWS_CLUSTER_SIZE)",
         }
 
+        # Determine which routes to use - user-provided or defaults
+        final_routes = []
+        final_extra_routes = []
+        final_forwarder_type = forwarder_type
+
+        if routes is not None:
+            final_routes = routes
+        else:
+            # Default to OpenAI compatibility routes for VLLM
+            final_routes = [OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH]
+
+        if extra_routes is not None:
+            final_extra_routes = extra_routes
+
         create_model_bundle_v2_request = CreateModelBundleV2Request(
             name=endpoint_unique_name,
             schema_location="TBA",
@@ -1101,7 +1144,9 @@ async def create_vllm_multinode_bundle(
                 healthcheck_route="/health",
                 predict_route="/predict",
                 streaming_predict_route="/stream",
-                routes=[OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH],
+                routes=final_routes,
+                extra_routes=final_extra_routes,
+                forwarder_type=final_forwarder_type,
                 env=common_vllm_envs,
                 worker_command=worker_command,
                 worker_env=common_vllm_envs,
@@ -1343,6 +1388,10 @@ async def execute(
             chat_template_override=request.chat_template_override,
             nodes_per_worker=request.nodes_per_worker,
             additional_args=request.model_dump(exclude_none=True),
+            # Pass route configuration to bundle creation
+            routes=request.routes,
+            extra_routes=request.extra_routes,
+            forwarder_type=request.forwarder_type,
         )
         validate_resource_requests(
             bundle=bundle,
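For callers that hit the model-engine gateway directly instead of the Python client, the new CreateModelEndpointV1Request fields translate into a request payload along the lines of the hedged sketch below; the gateway URL, the basic-auth style, and the elided bundle and resource fields are assumptions for illustration, not part of this diff.

import requests

payload = {
    "name": "multi-route-fastapi-example",
    # Bundle and resource fields required by CreateModelEndpointV1Request are omitted for brevity.
    "routes": ["/v1/chat/completions", "/v1/completions", "/analyze", "/custom/endpoint"],
    "extra_routes": [],  # legacy field; prefer "routes"
    "forwarder_type": "passthrough",
}

response = requests.post(
    "https://your-launch-domain.com/v1/model-endpoints",  # assumed gateway path
    json=payload,
    auth=("YOUR_API_KEY", ""),  # assumed auth scheme; adjust to your deployment
)
response.raise_for_status()
print(response.json())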