2 changes: 1 addition & 1 deletion optillm/__init__.py
@@ -1,5 +1,5 @@
# Version information
__version__ = "0.2.3"
__version__ = "0.2.4"

# Import from server module
from .server import (
100 changes: 58 additions & 42 deletions optillm/plugins/proxy/README.md
@@ -51,7 +51,7 @@ routing:
# Option A: Use proxy as default for ALL requests (recommended)
optillm --approach proxy

# Option B: Start server normally (requires model prefix or extra_body)
# Option B: Start server normally (use model prefix or extra_body per request)
optillm

# With custom port
@@ -60,33 +60,34 @@ optillm --approach proxy --port 8000

### 3. Usage Examples

#### When using `--approach proxy` (Recommended)
#### Method 1: Using --approach proxy (Recommended)
```bash
# No need for "proxy-" prefix! The proxy handles all requests automatically
# Start server with proxy as default approach
optillm --approach proxy

# Then make normal requests - proxy handles all routing automatically!
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "Hello"}]
}'

# The proxy will:
# 1. Route to one of your configured providers
# 2. Apply model mapping if configured
# 3. Handle failover automatically
```

#### Without `--approach proxy` flag
#### Method 2: Using Model Prefix (when server started without --approach proxy)
```bash
# Method 1: Use model prefix
# Use "proxy-" prefix to activate the proxy plugin
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "proxy-gpt-4",
"messages": [{"role": "user", "content": "Hello"}]
}'
```

# Method 2: Use extra_body
#### Method 3: Using extra_body (when server started without --approach proxy)
```bash
# Use extra_body parameter
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
@@ -98,44 +99,37 @@ curl -X POST http://localhost:8000/v1/chat/completions \
}'
```

#### Proxy with Approach/Plugin
Both the model-prefix and `extra_body` methods will:
- Route to one of your configured providers
- Apply model mapping if configured
- Handle failover automatically

#### Combined Approaches
```bash
# Use MOA approach with proxy load balancing
# Apply BON sampling, then route through proxy
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "Solve this problem"}],
"extra_body": {
"optillm_approach": "proxy",
"proxy_wrap": "moa"
}
"model": "bon&proxy-gpt-4",
"messages": [{"role": "user", "content": "Generate ideas"}]
}'
```

# Use memory plugin with proxy
#### Proxy Wrapping Other Approaches
```bash
# Use proxy to wrap MOA approach
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "Remember this"}],
"messages": [{"role": "user", "content": "Solve this problem"}],
"extra_body": {
"optillm_approach": "proxy",
"proxy_wrap": "memory"
"proxy_wrap": "moa"
}
}'
```

#### Combined Approaches
```bash
# Apply BON sampling, then route through proxy
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "bon&proxy-gpt-4",
"messages": [{"role": "user", "content": "Generate ideas"}]
}'
```

## Configuration Reference

### Provider Configuration
@@ -203,7 +197,7 @@ providers:

### Model-Specific Routing

When using `--approach proxy`, the proxy automatically maps model names to provider-specific deployments:
The proxy automatically maps model names to provider-specific deployments:

```yaml
providers:
@@ -222,9 +216,9 @@ providers:
# No model_map needed - uses model names as-is
```

With this configuration and `optillm --approach proxy`:
- Request for "gpt-4" → Azure uses "gpt-4-deployment-001", OpenAI uses "gpt-4"
- Request for "gpt-3.5-turbo" → Azure uses "gpt-35-turbo-deployment", OpenAI uses "gpt-3.5-turbo"
With this configuration and `proxy-gpt-4` model requests:
- Request for "proxy-gpt-4" → Azure uses "gpt-4-deployment-001", OpenAI uses "gpt-4"
- Request for "proxy-gpt-3.5-turbo" → Azure uses "gpt-35-turbo-deployment", OpenAI uses "gpt-3.5-turbo"

### Failover Configuration

@@ -358,19 +352,35 @@ client = OpenAI(
api_key="dummy" # Can be any string when using proxy
)

# If server started with --approach proxy:
# Method 1: Server started with --approach proxy (recommended)
# Just make normal requests - proxy handles everything!
response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}]
)

# Method 2: Use proxy with model prefix
response = client.chat.completions.create(
model="gpt-4", # No "proxy-" prefix needed!
model="proxy-gpt-4", # Use "proxy-" prefix
messages=[{"role": "user", "content": "Hello"}]
)

# Or explicitly use proxy with another approach:
# Method 3: Use extra_body
response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}],
extra_body={
"optillm_approach": "proxy"
}
)

# Method 4: Proxy wrapping another approach
response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Hello"}],
extra_body={
"optillm_approach": "proxy",
"proxy_wrap": "moa" # Proxy will route MOA's requests
"proxy_wrap": "moa"
}
)
```
@@ -379,12 +389,18 @@ response = client.chat.completions.create(
```python
from langchain.llms import OpenAI

# If server started with --approach proxy:
# If server started with --approach proxy (recommended)
llm = OpenAI(
openai_api_base="http://localhost:8000/v1",
model_name="gpt-4" # Proxy handles routing automatically
)

# Or use proxy with model prefix
llm = OpenAI(
openai_api_base="http://localhost:8000/v1",
model_name="proxy-gpt-4" # Use "proxy-" prefix
)

response = llm("What is the meaning of life?")
```

15 changes: 12 additions & 3 deletions optillm/plugins/proxy/client.py
@@ -114,6 +114,15 @@ class _Completions:
def __init__(self, proxy_client):
self.proxy_client = proxy_client

def _filter_kwargs(self, kwargs: dict) -> dict:
"""Filter out OptiLLM-specific parameters that shouldn't be sent to providers"""
optillm_params = {
'optillm_approach', 'proxy_wrap', 'wrapped_approach', 'wrap',
'mcts_simulations', 'mcts_exploration', 'mcts_depth',
'best_of_n', 'rstar_max_depth', 'rstar_num_rollouts', 'rstar_c'
}
return {k: v for k, v in kwargs.items() if k not in optillm_params}

def create(self, **kwargs):
"""Create completion with load balancing and failover"""
model = kwargs.get('model', 'unknown')
@@ -145,8 +154,8 @@ def create(self, **kwargs):
attempted_providers.add(provider)

try:
# Map model name if needed
request_kwargs = kwargs.copy()
# Map model name if needed and filter out OptiLLM-specific parameters
request_kwargs = self._filter_kwargs(kwargs.copy())
request_kwargs['model'] = provider.map_model(model)

# Track timing
@@ -177,7 +186,7 @@ def create(self, **kwargs):
if self.proxy_client.fallback_client:
logger.warning("All proxy providers failed, using fallback client")
try:
return self.proxy_client.fallback_client.chat.completions.create(**kwargs)
return self.proxy_client.fallback_client.chat.completions.create(**self._filter_kwargs(kwargs))
except Exception as e:
errors.append(("fallback_client", str(e)))

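To make the new `_filter_kwargs` behaviour concrete, here is a standalone sketch of the same idea (a simplified illustration, not the plugin module itself):

```python
# Standalone sketch of the parameter filtering introduced in _filter_kwargs.
OPTILLM_PARAMS = {"optillm_approach", "proxy_wrap", "wrapped_approach", "wrap"}

def filter_kwargs(kwargs: dict) -> dict:
    """Drop OptiLLM-specific keys so only provider-supported parameters are forwarded."""
    return {k: v for k, v in kwargs.items() if k not in OPTILLM_PARAMS}

request = {
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "Hello"}],
    "optillm_approach": "proxy",  # consumed by OptiLLM, never sent upstream
    "proxy_wrap": "moa",
}
print(filter_kwargs(request))  # only "model" and "messages" remain
```
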
6 changes: 4 additions & 2 deletions optillm/server.py
@@ -992,12 +992,14 @@ def main():
global request_batcher
global conversation_logger
# Call this function at the start of main()

# Load plugins first so they're available in argument parser
load_plugins()

args = parse_args()
# Update server_config with all argument values
server_config.update(vars(args))

load_plugins()

port = server_config['port']

# Initialize request batcher if batch mode is enabled
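
The server.py reordering above ensures plugins are loaded before the argument parser is built. The toy example below shows why the order matters when plugin-discovered approach names feed into the parser (a hypothetical sketch, not OptiLLM's real parser):

```python
# Hypothetical sketch: approach names discovered from plugins must exist
# before the CLI parser that offers them as choices is constructed.
import argparse

def load_plugins():
    # pretend these approaches were discovered from plugin modules
    return ["proxy", "memory", "moa"]

def parse_args(plugin_approaches, argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--approach", choices=["none", *plugin_approaches], default="none")
    return parser.parse_args(argv)

args = parse_args(load_plugins(), ["--approach", "proxy"])  # plugins loaded first
print(args.approach)  # proxy
```
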
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "optillm"
version = "0.2.3"
version = "0.2.4"
description = "An optimizing inference proxy for LLMs."
readme = "README.md"
license = "Apache-2.0"