From 23cbb91584b09bbea76fb453d7858081c00a702b Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Mon, 8 Sep 2025 18:05:33 +0800
Subject: [PATCH 1/3] Update README.md

---
 optillm/plugins/proxy/README.md | 96 ++++++++------------------------
 1 file changed, 21 insertions(+), 75 deletions(-)

diff --git a/optillm/plugins/proxy/README.md b/optillm/plugins/proxy/README.md
index 71424db4..d7005f09 100644
--- a/optillm/plugins/proxy/README.md
+++ b/optillm/plugins/proxy/README.md
@@ -48,82 +48,36 @@ routing:
 
 ### 2. Start OptiLLM Server
 
 ```bash
-# Option A: Use proxy as default for ALL requests (recommended)
-optillm --approach proxy
-
-# Option B: Start server normally (requires model prefix or extra_body)
+# Start server normally
 optillm
 
 # With custom port
-optillm --approach proxy --port 8000
+optillm --port 8000
 ```
 
+> **Note**: The `--approach proxy` flag is not currently supported. Use the model prefix method below.
+
 ### 3. Usage Examples
 
-#### When using `--approach proxy` (Recommended)
+#### Using Model Prefix (Currently the only working method)
 ```bash
-# No need for "proxy-" prefix! The proxy handles all requests automatically
+# Use "proxy-" prefix to activate the proxy plugin
 curl -X POST http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "gpt-4",
+    "model": "proxy-gpt-4",
     "messages": [{"role": "user", "content": "Hello"}]
   }'
 
 # The proxy will:
 # 1. Route to one of your configured providers
-# 2. Apply model mapping if configured
+# 2. Apply model mapping if configured 
 # 3. Handle failover automatically
 ```
 
-#### Without `--approach proxy` flag
-```bash
-# Method 1: Use model prefix
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "proxy-gpt-4",
-    "messages": [{"role": "user", "content": "Hello"}]
-  }'
-
-# Method 2: Use extra_body
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "gpt-4",
-    "messages": [{"role": "user", "content": "Hello"}],
-    "extra_body": {
-      "optillm_approach": "proxy"
-    }
-  }'
-```
-
-#### Proxy with Approach/Plugin
-```bash
-# Use MOA approach with proxy load balancing
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "gpt-4",
-    "messages": [{"role": "user", "content": "Solve this problem"}],
-    "extra_body": {
-      "optillm_approach": "proxy",
-      "proxy_wrap": "moa"
-    }
-  }'
-
-# Use memory plugin with proxy
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "gpt-4",
-    "messages": [{"role": "user", "content": "Remember this"}],
-    "extra_body": {
-      "optillm_approach": "proxy",
-      "proxy_wrap": "memory"
-    }
-  }'
-```
+> **Known Issues**:
+> - `--approach proxy` flag: Not supported in command-line interface
+> - `extra_body` method: Currently broken due to parsing bug in server code
 
 #### Combined Approaches
 ```bash
@@ -136,6 +90,8 @@ curl -X POST http://localhost:8000/v1/chat/completions \
   }'
 ```
 
+> **Note**: The proxy wrapping functionality (`proxy_wrap`) is currently not accessible via the working model prefix method. This would require the `extra_body` approach which is currently broken.
+
 ## Configuration Reference
 
 ### Provider Configuration
@@ -203,7 +159,7 @@ providers:
 
 ### Model-Specific Routing
 
-When using `--approach proxy`, the proxy automatically maps model names to provider-specific deployments:
+The proxy automatically maps model names to provider-specific deployments:
 
 ```yaml
 providers:
@@ -222,9 +178,9 @@ providers:
   # No model_map needed - uses model names as-is
 ```
 
-With this configuration and `optillm --approach proxy`:
-- Request for "gpt-4" → Azure uses "gpt-4-deployment-001", OpenAI uses "gpt-4"
-- Request for "gpt-3.5-turbo" → Azure uses "gpt-35-turbo-deployment", OpenAI uses "gpt-3.5-turbo"
+With this configuration and `proxy-gpt-4` model requests:
+- Request for "proxy-gpt-4" → Azure uses "gpt-4-deployment-001", OpenAI uses "gpt-4"
+- Request for "proxy-gpt-3.5-turbo" → Azure uses "gpt-35-turbo-deployment", OpenAI uses "gpt-3.5-turbo"
 
 ### Failover Configuration
@@ -358,31 +314,21 @@ client = OpenAI(
     api_key="dummy"  # Can be any string when using proxy
 )
 
-# If server started with --approach proxy:
+# Use proxy with model prefix (currently the only working method)
 response = client.chat.completions.create(
-    model="gpt-4",  # No "proxy-" prefix needed!
+    model="proxy-gpt-4",  # Use "proxy-" prefix
     messages=[{"role": "user", "content": "Hello"}]
 )
-
-# Or explicitly use proxy with another approach:
-response = client.chat.completions.create(
-    model="gpt-4",
-    messages=[{"role": "user", "content": "Hello"}],
-    extra_body={
-        "optillm_approach": "proxy",
-        "proxy_wrap": "moa"  # Proxy will route MOA's requests
-    }
-)
 ```
 
 ### With LangChain
 ```python
 from langchain.llms import OpenAI
 
-# If server started with --approach proxy:
+# Use proxy with model prefix
 llm = OpenAI(
     openai_api_base="http://localhost:8000/v1",
-    model_name="gpt-4"  # Proxy handles routing automatically
+    model_name="proxy-gpt-4"  # Use "proxy-" prefix
 )
 
 response = llm("What is the meaning of life?")
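The model-prefix activation this first patch documents amounts to stripping a known plugin name from the front of the requested model and forwarding the remainder upstream. A minimal sketch of that dispatch, using a hypothetical `split_approach` helper (illustrative only, not optillm's actual implementation):

```python
# Illustrative sketch of "proxy-" prefix dispatch; split_approach is a
# hypothetical helper, not a function in the optillm codebase.
def split_approach(model: str, known_plugins: set[str]) -> tuple[str | None, str]:
    """Return (plugin, underlying_model) for names like 'proxy-gpt-4'."""
    for plugin in known_plugins:
        prefix = plugin + "-"
        if model.startswith(prefix):
            return plugin, model[len(prefix):]
    return None, model  # no prefix: the request passes through untouched

assert split_approach("proxy-gpt-4", {"proxy"}) == ("proxy", "gpt-4")
assert split_approach("gpt-4", {"proxy"}) == (None, "gpt-4")
```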
From 4a8f10c9637733f03344592d85103dbc38381cf2 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Mon, 8 Sep 2025 18:18:11 +0800
Subject: [PATCH 2/3] fixes

---
 optillm/__init__.py             |  2 +-
 optillm/plugins/proxy/README.md | 64 +++++++++++++++++++++++++++------
 optillm/plugins/proxy/client.py | 15 ++++++--
 optillm/server.py               |  6 ++--
 pyproject.toml                  |  2 +-
 5 files changed, 71 insertions(+), 18 deletions(-)

diff --git a/optillm/__init__.py b/optillm/__init__.py
index 090f917a..994b643f 100644
--- a/optillm/__init__.py
+++ b/optillm/__init__.py
@@ -1,5 +1,5 @@
 # Version information
-__version__ = "0.2.3"
+__version__ = "0.2.4"
 
 # Import from server module
 from .server import (
diff --git a/optillm/plugins/proxy/README.md b/optillm/plugins/proxy/README.md
index d7005f09..3b85e6f6 100644
--- a/optillm/plugins/proxy/README.md
+++ b/optillm/plugins/proxy/README.md
@@ -55,11 +55,11 @@ optillm
 optillm --port 8000
 ```
 
-> **Note**: The `--approach proxy` flag is not currently supported. Use the model prefix method below.
+> **Note**: The `--approach proxy` flag is not currently supported in the command-line interface.
 
 ### 3. Usage Examples
 
-#### Using Model Prefix (Currently the only working method)
+#### Method 1: Using Model Prefix
 ```bash
 # Use "proxy-" prefix to activate the proxy plugin
 curl -X POST http://localhost:8000/v1/chat/completions \
@@ -68,16 +68,26 @@ curl -X POST http://localhost:8000/v1/chat/completions \
     "model": "proxy-gpt-4",
     "messages": [{"role": "user", "content": "Hello"}]
   }'
+```
 
-# The proxy will:
-# 1. Route to one of your configured providers
-# 2. Apply model mapping if configured 
-# 3. Handle failover automatically
+#### Method 2: Using extra_body (Recommended for SDK usage)
+```bash
+# Use extra_body parameter
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4",
+    "messages": [{"role": "user", "content": "Hello"}],
+    "extra_body": {
+      "optillm_approach": "proxy"
+    }
+  }'
 ```
 
-> **Known Issues**:
-> - `--approach proxy` flag: Not supported in command-line interface
-> - `extra_body` method: Currently broken due to parsing bug in server code
+Both methods will:
+- Route to one of your configured providers
+- Apply model mapping if configured
+- Handle failover automatically
 
 #### Combined Approaches
 ```bash
@@ -90,7 +100,20 @@ curl -X POST http://localhost:8000/v1/chat/completions \
   }'
 ```
 
-> **Note**: The proxy wrapping functionality (`proxy_wrap`) is currently not accessible via the working model prefix method. This would require the `extra_body` approach which is currently broken.
+#### Proxy Wrapping Other Approaches
+```bash
+# Use proxy to wrap MOA approach
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4",
+    "messages": [{"role": "user", "content": "Solve this problem"}],
+    "extra_body": {
+      "optillm_approach": "proxy",
+      "proxy_wrap": "moa"
+    }
+  }'
+```
 
 ## Configuration Reference
 
@@ -314,11 +337,30 @@ client = OpenAI(
     api_key="dummy"  # Can be any string when using proxy
 )
 
-# Use proxy with model prefix (currently the only working method)
+# Method 1: Use proxy with model prefix
 response = client.chat.completions.create(
     model="proxy-gpt-4",  # Use "proxy-" prefix
     messages=[{"role": "user", "content": "Hello"}]
 )
+
+# Method 2: Use extra_body (recommended)
+response = client.chat.completions.create(
+    model="gpt-4",
+    messages=[{"role": "user", "content": "Hello"}],
+    extra_body={
+        "optillm_approach": "proxy"
+    }
+)
+
+# Method 3: Proxy wrapping another approach
+response = client.chat.completions.create(
+    model="gpt-4",
+    messages=[{"role": "user", "content": "Hello"}],
+    extra_body={
+        "optillm_approach": "proxy",
+        "proxy_wrap": "moa"
+    }
+)
 ```
 
 ### With LangChain
diff --git a/optillm/plugins/proxy/client.py b/optillm/plugins/proxy/client.py
index 3b5a7e0a..2b36d18c 100644
--- a/optillm/plugins/proxy/client.py
+++ b/optillm/plugins/proxy/client.py
@@ -114,6 +114,15 @@ class _Completions:
     def __init__(self, proxy_client):
         self.proxy_client = proxy_client
 
+    def _filter_kwargs(self, kwargs: dict) -> dict:
+        """Filter out OptiLLM-specific parameters that shouldn't be sent to providers"""
+        optillm_params = {
+            'optillm_approach', 'proxy_wrap', 'wrapped_approach', 'wrap',
+            'mcts_simulations', 'mcts_exploration', 'mcts_depth',
+            'best_of_n', 'rstar_max_depth', 'rstar_num_rollouts', 'rstar_c'
+        }
+        return {k: v for k, v in kwargs.items() if k not in optillm_params}
+
     def create(self, **kwargs):
         """Create completion with load balancing and failover"""
         model = kwargs.get('model', 'unknown')
@@ -145,8 +154,8 @@ def create(self, **kwargs):
             attempted_providers.add(provider)
 
             try:
-                # Map model name if needed
-                request_kwargs = kwargs.copy()
+                # Map model name if needed and filter out OptiLLM-specific parameters
+                request_kwargs = self._filter_kwargs(kwargs.copy())
                 request_kwargs['model'] = provider.map_model(model)
 
                 # Track timing
@@ -177,7 +186,7 @@ def create(self, **kwargs):
         if self.proxy_client.fallback_client:
             logger.warning("All proxy providers failed, using fallback client")
             try:
-                return self.proxy_client.fallback_client.chat.completions.create(**kwargs)
+                return self.proxy_client.fallback_client.chat.completions.create(**self._filter_kwargs(kwargs))
             except Exception as e:
                 errors.append(("fallback_client", str(e)))
diff --git a/optillm/server.py b/optillm/server.py
index 29271d34..afb99c98 100644
--- a/optillm/server.py
+++ b/optillm/server.py
@@ -992,12 +992,14 @@ def main():
     global request_batcher
     global conversation_logger
     # Call this function at the start of main()
+
+    # Load plugins first so they're available in argument parser
+    load_plugins()
+
     args = parse_args()
 
     # Update server_config with all argument values
     server_config.update(vars(args))
 
-    load_plugins()
-
     port = server_config['port']
 
     # Initialize request batcher if batch mode is enabled
diff --git a/pyproject.toml b/pyproject.toml
index 74c6479b..8e180219 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "optillm"
-version = "0.2.3"
+version = "0.2.4"
 description = "An optimizing inference proxy for LLMs."
 readme = "README.md"
 license = "Apache-2.0"
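The `_filter_kwargs` addition above is the substantive fix in this patch: OptiLLM-specific routing parameters must be consumed by the proxy rather than forwarded to upstream providers, which may reject them as unknown fields. A standalone sketch of that behavior, mirroring the parameter set from the diff (the surrounding harness is illustrative, not part of the patch):

```python
# Mirrors the optillm_params set added to client.py in the patch above.
OPTILLM_PARAMS = {
    'optillm_approach', 'proxy_wrap', 'wrapped_approach', 'wrap',
    'mcts_simulations', 'mcts_exploration', 'mcts_depth',
    'best_of_n', 'rstar_max_depth', 'rstar_num_rollouts', 'rstar_c',
}

def filter_kwargs(kwargs: dict) -> dict:
    """Drop OptiLLM-specific keys so providers only see standard OpenAI params."""
    return {k: v for k, v in kwargs.items() if k not in OPTILLM_PARAMS}

request = {
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "Hello"}],
    "optillm_approach": "proxy",  # consumed by OptiLLM, never sent upstream
    "proxy_wrap": "moa",          # likewise stripped before the provider call
}
assert filter_kwargs(request) == {
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "Hello"}],
}
```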
From e01d50284e953c8c62cbf0248a7010c4a81ea29f Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Mon, 8 Sep 2025 18:19:37 +0800
Subject: [PATCH 3/3] Update README.md

---
 optillm/plugins/proxy/README.md | 48 ++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/optillm/plugins/proxy/README.md b/optillm/plugins/proxy/README.md
index 3b85e6f6..05a55867 100644
--- a/optillm/plugins/proxy/README.md
+++ b/optillm/plugins/proxy/README.md
@@ -48,18 +48,33 @@ routing:
 ### 2. Start OptiLLM Server
 
 ```bash
-# Start server normally
+# Option A: Use proxy as default for ALL requests (recommended)
+optillm --approach proxy
+
+# Option B: Start server normally (use model prefix or extra_body per request)
 optillm
 
 # With custom port
-optillm --port 8000
+optillm --approach proxy --port 8000
 ```
 
-> **Note**: The `--approach proxy` flag is not currently supported in the command-line interface.
-
 ### 3. Usage Examples
 
-#### Method 1: Using Model Prefix
+#### Method 1: Using --approach proxy (Recommended)
+```bash
+# Start server with proxy as default approach
+optillm --approach proxy
+
+# Then make normal requests - proxy handles all routing automatically!
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4",
+    "messages": [{"role": "user", "content": "Hello"}]
+  }'
+```
+
+#### Method 2: Using Model Prefix (when server started without --approach proxy)
 ```bash
 # Use "proxy-" prefix to activate the proxy plugin
 curl -X POST http://localhost:8000/v1/chat/completions \
@@ -70,7 +85,7 @@ curl -X POST http://localhost:8000/v1/chat/completions \
   }'
 ```
 
-#### Method 2: Using extra_body (Recommended for SDK usage)
+#### Method 3: Using extra_body (when server started without --approach proxy)
 ```bash
 # Use extra_body parameter
 curl -X POST http://localhost:8000/v1/chat/completions \
@@ -337,13 +352,20 @@ client = OpenAI(
     api_key="dummy"  # Can be any string when using proxy
 )
 
-# Method 1: Use proxy with model prefix
+# Method 1: Server started with --approach proxy (recommended)
+# Just make normal requests - proxy handles everything!
+response = client.chat.completions.create(
+    model="gpt-4",
+    messages=[{"role": "user", "content": "Hello"}]
+)
+
+# Method 2: Use proxy with model prefix
 response = client.chat.completions.create(
     model="proxy-gpt-4",  # Use "proxy-" prefix
     messages=[{"role": "user", "content": "Hello"}]
 )
 
-# Method 2: Use extra_body (recommended)
+# Method 3: Use extra_body
 response = client.chat.completions.create(
     model="gpt-4",
     messages=[{"role": "user", "content": "Hello"}],
@@ -352,7 +374,7 @@ response = client.chat.completions.create(
     }
 )
 
-# Method 3: Proxy wrapping another approach
+# Method 4: Proxy wrapping another approach
 response = client.chat.completions.create(
     model="gpt-4",
     messages=[{"role": "user", "content": "Hello"}],
@@ -367,7 +389,13 @@ response = client.chat.completions.create(
 ```python
 from langchain.llms import OpenAI
 
-# Use proxy with model prefix
+# If server started with --approach proxy (recommended)
+llm = OpenAI(
+    openai_api_base="http://localhost:8000/v1",
+    model_name="gpt-4"  # Proxy handles routing automatically
+)
+
+# Or use proxy with model prefix
 llm = OpenAI(
     openai_api_base="http://localhost:8000/v1",
     model_name="proxy-gpt-4"  # Use "proxy-" prefix
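This final patch can re-document `--approach proxy` because the server fix in the previous patch moves `load_plugins()` ahead of `parse_args()`, making plugin-provided approaches visible when CLI arguments are processed. A minimal sketch of why that ordering matters, assuming for illustration that the parser validates `--approach` against the discovered approach names (this is not the actual server code):

```python
import argparse

def load_plugins() -> list[str]:
    # Stand-in for plugin discovery; the real server scans its plugin directory.
    return ["proxy", "memory"]

BUILT_IN_APPROACHES = ["none", "moa", "bon"]

def build_parser(known_approaches: list[str]) -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser()
    parser.add_argument("--approach", choices=known_approaches, default="none")
    return parser

# Broken ordering: parser built before plugins load, so "proxy" is rejected.
# Fixed ordering (what the patch does): discover plugins, then parse arguments.
approaches = BUILT_IN_APPROACHES + load_plugins()
args = build_parser(approaches).parse_args(["--approach", "proxy"])
print(args.approach)  # -> proxy
```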