From 23cbb91584b09bbea76fb453d7858081c00a702b Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Mon, 8 Sep 2025 18:05:33 +0800
Subject: [PATCH 1/3] Update README.md

---
 optillm/plugins/proxy/README.md | 96 ++++++++------------------------
 1 file changed, 21 insertions(+), 75 deletions(-)

diff --git a/optillm/plugins/proxy/README.md b/optillm/plugins/proxy/README.md
index 71424db4..d7005f09 100644
--- a/optillm/plugins/proxy/README.md
+++ b/optillm/plugins/proxy/README.md
@@ -48,82 +48,36 @@ routing:
 
 ### 2. Start OptiLLM Server
 
 ```bash
-# Option A: Use proxy as default for ALL requests (recommended)
-optillm --approach proxy
-
-# Option B: Start server normally (requires model prefix or extra_body)
+# Start server normally
 optillm
 
 # With custom port
-optillm --approach proxy --port 8000
+optillm --port 8000
 ```
 
+> **Note**: The `--approach proxy` flag is not currently supported. Use the model prefix method below.
+
 ### 3. Usage Examples
 
-#### When using `--approach proxy` (Recommended)
+#### Using Model Prefix (Currently the only working method)
 ```bash
-# No need for "proxy-" prefix! The proxy handles all requests automatically
+# Use "proxy-" prefix to activate the proxy plugin
 curl -X POST http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "gpt-4",
+    "model": "proxy-gpt-4",
     "messages": [{"role": "user", "content": "Hello"}]
   }'
 
 # The proxy will:
 # 1. Route to one of your configured providers
-# 2. Apply model mapping if configured
+# 2. Apply model mapping if configured 
 # 3. Handle failover automatically
 ```
 
-#### Without `--approach proxy` flag
-```bash
-# Method 1: Use model prefix
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "proxy-gpt-4",
-    "messages": [{"role": "user", "content": "Hello"}]
-  }'
-
-# Method 2: Use extra_body
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "gpt-4",
-    "messages": [{"role": "user", "content": "Hello"}],
-    "extra_body": {
-      "optillm_approach": "proxy"
-    }
-  }'
-```
-
-#### Proxy with Approach/Plugin
-```bash
-# Use MOA approach with proxy load balancing
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "gpt-4",
-    "messages": [{"role": "user", "content": "Solve this problem"}],
-    "extra_body": {
-      "optillm_approach": "proxy",
-      "proxy_wrap": "moa"
-    }
-  }'
-
-# Use memory plugin with proxy
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "gpt-4",
-    "messages": [{"role": "user", "content": "Remember this"}],
-    "extra_body": {
-      "optillm_approach": "proxy",
-      "proxy_wrap": "memory"
-    }
-  }'
-```
+> **Known Issues**:
+> - `--approach proxy` flag: Not supported in command-line interface
+> - `extra_body` method: Currently broken due to parsing bug in server code
 
 #### Combined Approaches
 ```bash
@@ -136,6 +90,8 @@ curl -X POST http://localhost:8000/v1/chat/completions \
   }'
 ```
 
+> **Note**: The proxy wrapping functionality (`proxy_wrap`) is currently not accessible via the working model prefix method. This would require the `extra_body` approach which is currently broken.
+
 ## Configuration Reference
 
 ### Provider Configuration
@@ -203,7 +159,7 @@ providers:
 
 ### Model-Specific Routing
 
-When using `--approach proxy`, the proxy automatically maps model names to provider-specific deployments:
+The proxy automatically maps model names to provider-specific deployments:
 
 ```yaml
 providers:
@@ -222,9 +178,9 @@ providers:
   # No model_map needed - uses model names as-is
 ```
 
-With this configuration and `optillm --approach proxy`:
-- Request for "gpt-4" → Azure uses "gpt-4-deployment-001", OpenAI uses "gpt-4"
-- Request for "gpt-3.5-turbo" → Azure uses "gpt-35-turbo-deployment", OpenAI uses "gpt-3.5-turbo"
+With this configuration and `proxy-gpt-4` model requests:
+- Request for "proxy-gpt-4" → Azure uses "gpt-4-deployment-001", OpenAI uses "gpt-4"
+- Request for "proxy-gpt-3.5-turbo" → Azure uses "gpt-35-turbo-deployment", OpenAI uses "gpt-3.5-turbo"
 
 ### Failover Configuration
@@ -358,31 +314,21 @@ client = OpenAI(
     api_key="dummy"  # Can be any string when using proxy
 )
 
-# If server started with --approach proxy:
+# Use proxy with model prefix (currently the only working method)
 response = client.chat.completions.create(
-    model="gpt-4",  # No "proxy-" prefix needed!
+    model="proxy-gpt-4",  # Use "proxy-" prefix
     messages=[{"role": "user", "content": "Hello"}]
 )
-
-# Or explicitly use proxy with another approach:
-response = client.chat.completions.create(
-    model="gpt-4",
-    messages=[{"role": "user", "content": "Hello"}],
-    extra_body={
-        "optillm_approach": "proxy",
-        "proxy_wrap": "moa"  # Proxy will route MOA's requests
-    }
-)
 ```
 
 ### With LangChain
 ```python
 from langchain.llms import OpenAI
 
-# If server started with --approach proxy:
+# Use proxy with model prefix
 llm = OpenAI(
     openai_api_base="http://localhost:8000/v1",
-    model_name="gpt-4"  # Proxy handles routing automatically
+    model_name="proxy-gpt-4"  # Use "proxy-" prefix
 )
 
 response = llm("What is the meaning of life?")
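The model-prefix activation this first patch documents amounts to stripping a known plugin name from the front of the requested model and forwarding the remainder upstream. A minimal sketch of that dispatch, using a hypothetical `split_approach` helper (illustrative only, not optillm's actual implementation):

```python
# Illustrative sketch of "proxy-" prefix dispatch; split_approach is a
# hypothetical helper, not a function in the optillm codebase.
def split_approach(model: str, known_plugins: set[str]) -> tuple[str | None, str]:
    """Return (plugin, underlying_model) for names like 'proxy-gpt-4'."""
    for plugin in known_plugins:
        prefix = plugin + "-"
        if model.startswith(prefix):
            return plugin, model[len(prefix):]
    return None, model  # no prefix: the request passes through untouched

assert split_approach("proxy-gpt-4", {"proxy"}) == ("proxy", "gpt-4")
assert split_approach("gpt-4", {"proxy"}) == (None, "gpt-4")
```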
From 4a8f10c9637733f03344592d85103dbc38381cf2 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Mon, 8 Sep 2025 18:18:11 +0800
Subject: [PATCH 2/3] fixes

---
 optillm/__init__.py             |  2 +-
 optillm/plugins/proxy/README.md | 64 +++++++++++++++++++++++++++------
 optillm/plugins/proxy/client.py | 15 ++++++--
 optillm/server.py               |  6 ++--
 pyproject.toml                  |  2 +-
 5 files changed, 71 insertions(+), 18 deletions(-)

diff --git a/optillm/__init__.py b/optillm/__init__.py
index 090f917a..994b643f 100644
--- a/optillm/__init__.py
+++ b/optillm/__init__.py
@@ -1,5 +1,5 @@
 # Version information
-__version__ = "0.2.3"
+__version__ = "0.2.4"
 
 # Import from server module
 from .server import (
diff --git a/optillm/plugins/proxy/README.md b/optillm/plugins/proxy/README.md
index d7005f09..3b85e6f6 100644
--- a/optillm/plugins/proxy/README.md
+++ b/optillm/plugins/proxy/README.md
@@ -55,11 +55,11 @@ optillm
 optillm --port 8000
 ```
 
-> **Note**: The `--approach proxy` flag is not currently supported. Use the model prefix method below.
+> **Note**: The `--approach proxy` flag is not currently supported in the command-line interface.
 
 ### 3. Usage Examples
 
-#### Using Model Prefix (Currently the only working method)
+#### Method 1: Using Model Prefix
 ```bash
 # Use "proxy-" prefix to activate the proxy plugin
 curl -X POST http://localhost:8000/v1/chat/completions \
@@ -68,16 +68,26 @@ curl -X POST http://localhost:8000/v1/chat/completions \
     "model": "proxy-gpt-4",
     "messages": [{"role": "user", "content": "Hello"}]
   }'
+```
 
-# The proxy will:
-# 1. Route to one of your configured providers
-# 2. Apply model mapping if configured 
-# 3. Handle failover automatically
+#### Method 2: Using extra_body (Recommended for SDK usage)
+```bash
+# Use extra_body parameter
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4",
+    "messages": [{"role": "user", "content": "Hello"}],
+    "extra_body": {
+      "optillm_approach": "proxy"
+    }
+  }'
 ```
 
-> **Known Issues**:
-> - `--approach proxy` flag: Not supported in command-line interface
-> - `extra_body` method: Currently broken due to parsing bug in server code
+Both methods will:
+- Route to one of your configured providers
+- Apply model mapping if configured
+- Handle failover automatically
 
 #### Combined Approaches
 ```bash
@@ -90,7 +100,20 @@ curl -X POST http://localhost:8000/v1/chat/completions \
   }'
 ```
 
-> **Note**: The proxy wrapping functionality (`proxy_wrap`) is currently not accessible via the working model prefix method. This would require the `extra_body` approach which is currently broken.
+#### Proxy Wrapping Other Approaches
+```bash
+# Use proxy to wrap MOA approach
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4",
+    "messages": [{"role": "user", "content": "Solve this problem"}],
+    "extra_body": {
+      "optillm_approach": "proxy",
+      "proxy_wrap": "moa"
+    }
+  }'
+```
 
 ## Configuration Reference
 
@@ -314,11 +337,30 @@ client = OpenAI(
     api_key="dummy"  # Can be any string when using proxy
 )
 
-# Use proxy with model prefix (currently the only working method)
+# Method 1: Use proxy with model prefix
 response = client.chat.completions.create(
     model="proxy-gpt-4",  # Use "proxy-" prefix
     messages=[{"role": "user", "content": "Hello"}]
 )
+
+# Method 2: Use extra_body (recommended)
+response = client.chat.completions.create(
+    model="gpt-4",
+    messages=[{"role": "user", "content": "Hello"}],
+    extra_body={
+        "optillm_approach": "proxy"
+    }
+)
+
+# Method 3: Proxy wrapping another approach
+response = client.chat.completions.create(
+    model="gpt-4",
+    messages=[{"role": "user", "content": "Hello"}],
+    extra_body={
+        "optillm_approach": "proxy",
+        "proxy_wrap": "moa"
+    }
+)
 ```
 
 ### With LangChain
diff --git a/optillm/plugins/proxy/client.py b/optillm/plugins/proxy/client.py
index 3b5a7e0a..2b36d18c 100644
--- a/optillm/plugins/proxy/client.py
+++ b/optillm/plugins/proxy/client.py
@@ -114,6 +114,15 @@ class _Completions:
     def __init__(self, proxy_client):
         self.proxy_client = proxy_client
 
+    def _filter_kwargs(self, kwargs: dict) -> dict:
+        """Filter out OptiLLM-specific parameters that shouldn't be sent to providers"""
+        optillm_params = {
+            'optillm_approach', 'proxy_wrap', 'wrapped_approach', 'wrap',
+            'mcts_simulations', 'mcts_exploration', 'mcts_depth',
+            'best_of_n', 'rstar_max_depth', 'rstar_num_rollouts', 'rstar_c'
+        }
+        return {k: v for k, v in kwargs.items() if k not in optillm_params}
+
     def create(self, **kwargs):
         """Create completion with load balancing and failover"""
         model = kwargs.get('model', 'unknown')
@@ -145,8 +154,8 @@ def create(self, **kwargs):
             attempted_providers.add(provider)
 
             try:
-                # Map model name if needed
-                request_kwargs = kwargs.copy()
+                # Map model name if needed and filter out OptiLLM-specific parameters
+                request_kwargs = self._filter_kwargs(kwargs.copy())
                 request_kwargs['model'] = provider.map_model(model)
 
                 # Track timing
@@ -177,7 +186,7 @@ def create(self, **kwargs):
         if self.proxy_client.fallback_client:
             logger.warning("All proxy providers failed, using fallback client")
             try:
-                return self.proxy_client.fallback_client.chat.completions.create(**kwargs)
+                return self.proxy_client.fallback_client.chat.completions.create(**self._filter_kwargs(kwargs))
             except Exception as e:
                 errors.append(("fallback_client", str(e)))
diff --git a/optillm/server.py b/optillm/server.py
index 29271d34..afb99c98 100644
--- a/optillm/server.py
+++ b/optillm/server.py
@@ -992,12 +992,14 @@ def main():
     global request_batcher
     global conversation_logger
     # Call this function at the start of main()
+
+    # Load plugins first so they're available in argument parser
+    load_plugins()
+
     args = parse_args()
 
     # Update server_config with all argument values
     server_config.update(vars(args))
 
-    load_plugins()
-
     port = server_config['port']
 
     # Initialize request batcher if batch mode is enabled
diff --git a/pyproject.toml b/pyproject.toml
index 74c6479b..8e180219 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "optillm"
-version = "0.2.3"
+version = "0.2.4"
 description = "An optimizing inference proxy for LLMs."
 readme = "README.md"
 license = "Apache-2.0"
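The `_filter_kwargs` addition above is the substantive fix in this patch: OptiLLM-specific routing parameters must be consumed by the proxy rather than forwarded to upstream providers, which may reject them as unknown fields. A standalone sketch of that behavior, mirroring the parameter set from the diff (the surrounding harness is illustrative, not part of the patch):

```python
# Mirrors the optillm_params set added to client.py in the patch above.
OPTILLM_PARAMS = {
    'optillm_approach', 'proxy_wrap', 'wrapped_approach', 'wrap',
    'mcts_simulations', 'mcts_exploration', 'mcts_depth',
    'best_of_n', 'rstar_max_depth', 'rstar_num_rollouts', 'rstar_c',
}

def filter_kwargs(kwargs: dict) -> dict:
    """Drop OptiLLM-specific keys so providers only see standard OpenAI params."""
    return {k: v for k, v in kwargs.items() if k not in OPTILLM_PARAMS}

request = {
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "Hello"}],
    "optillm_approach": "proxy",  # consumed by OptiLLM, never sent upstream
    "proxy_wrap": "moa",          # likewise stripped before the provider call
}
assert filter_kwargs(request) == {
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "Hello"}],
}
```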
From e01d50284e953c8c62cbf0248a7010c4a81ea29f Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Mon, 8 Sep 2025 18:19:37 +0800
Subject: [PATCH 3/3] Update README.md

---
 optillm/plugins/proxy/README.md | 48 ++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/optillm/plugins/proxy/README.md b/optillm/plugins/proxy/README.md
index 3b85e6f6..05a55867 100644
--- a/optillm/plugins/proxy/README.md
+++ b/optillm/plugins/proxy/README.md
@@ -48,18 +48,33 @@ routing:
 ### 2. Start OptiLLM Server
 
 ```bash
-# Start server normally
+# Option A: Use proxy as default for ALL requests (recommended)
+optillm --approach proxy
+
+# Option B: Start server normally (use model prefix or extra_body per request)
 optillm
 
 # With custom port
-optillm --port 8000
+optillm --approach proxy --port 8000
 ```
 
-> **Note**: The `--approach proxy` flag is not currently supported in the command-line interface.
-
 ### 3. Usage Examples
 
-#### Method 1: Using Model Prefix
+#### Method 1: Using --approach proxy (Recommended)
+```bash
+# Start server with proxy as default approach
+optillm --approach proxy
+
+# Then make normal requests - proxy handles all routing automatically!
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4",
+    "messages": [{"role": "user", "content": "Hello"}]
+  }'
+```
+
+#### Method 2: Using Model Prefix (when server started without --approach proxy)
 ```bash
 # Use "proxy-" prefix to activate the proxy plugin
 curl -X POST http://localhost:8000/v1/chat/completions \
@@ -70,7 +85,7 @@ curl -X POST http://localhost:8000/v1/chat/completions \
   }'
 ```
 
-#### Method 2: Using extra_body (Recommended for SDK usage)
+#### Method 3: Using extra_body (when server started without --approach proxy)
 ```bash
 # Use extra_body parameter
 curl -X POST http://localhost:8000/v1/chat/completions \
@@ -337,13 +352,20 @@ client = OpenAI(
     api_key="dummy"  # Can be any string when using proxy
 )
 
-# Method 1: Use proxy with model prefix
+# Method 1: Server started with --approach proxy (recommended)
+# Just make normal requests - proxy handles everything!
+response = client.chat.completions.create(
+    model="gpt-4",
+    messages=[{"role": "user", "content": "Hello"}]
+)
+
+# Method 2: Use proxy with model prefix
 response = client.chat.completions.create(
     model="proxy-gpt-4",  # Use "proxy-" prefix
     messages=[{"role": "user", "content": "Hello"}]
 )
 
-# Method 2: Use extra_body (recommended)
+# Method 3: Use extra_body
 response = client.chat.completions.create(
     model="gpt-4",
     messages=[{"role": "user", "content": "Hello"}],
@@ -352,7 +374,7 @@ response = client.chat.completions.create(
     }
 )
 
-# Method 3: Proxy wrapping another approach
+# Method 4: Proxy wrapping another approach
 response = client.chat.completions.create(
     model="gpt-4",
     messages=[{"role": "user", "content": "Hello"}],
@@ -367,7 +389,13 @@ response = client.chat.completions.create(
 ```python
 from langchain.llms import OpenAI
 
-# Use proxy with model prefix
+# If server started with --approach proxy (recommended)
+llm = OpenAI(
+    openai_api_base="http://localhost:8000/v1",
+    model_name="gpt-4"  # Proxy handles routing automatically
+)
+
+# Or use proxy with model prefix
 llm = OpenAI(
     openai_api_base="http://localhost:8000/v1",
     model_name="proxy-gpt-4"  # Use "proxy-" prefix
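This final patch can re-document `--approach proxy` because the server fix in the previous patch moves `load_plugins()` ahead of `parse_args()`, making plugin-provided approaches visible when CLI arguments are processed. A minimal sketch of why that ordering matters, assuming for illustration that the parser validates `--approach` against the discovered approach names (this is not the actual server code):

```python
import argparse

def load_plugins() -> list[str]:
    # Stand-in for plugin discovery; the real server scans its plugin directory.
    return ["proxy", "memory"]

BUILT_IN_APPROACHES = ["none", "moa", "bon"]

def build_parser(known_approaches: list[str]) -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser()
    parser.add_argument("--approach", choices=known_approaches, default="none")
    return parser

# Broken ordering: parser built before plugins load, so "proxy" is rejected.
# Fixed ordering (what the patch does): discover plugins, then parse arguments.
approaches = BUILT_IN_APPROACHES + load_plugins()
args = build_parser(approaches).parse_args(["--approach", "proxy"])
print(args.approach)  # -> proxy
```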