
Commit 4a8f10c

fixes

committed · 1 parent 23cbb91 · commit 4a8f10c

5 files changed: +71 -18 lines changed

optillm/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 # Version information
-__version__ = "0.2.3"
+__version__ = "0.2.4"

 # Import from server module
 from .server import (

optillm/plugins/proxy/README.md

Lines changed: 53 additions & 11 deletions
@@ -55,11 +55,11 @@ optillm
 optillm --port 8000
 ```

-> **Note**: The `--approach proxy` flag is not currently supported. Use the model prefix method below.
+> **Note**: The `--approach proxy` flag is not currently supported in the command-line interface.

 ### 3. Usage Examples

-#### Using Model Prefix (Currently the only working method)
+#### Method 1: Using Model Prefix
 ```bash
 # Use "proxy-" prefix to activate the proxy plugin
 curl -X POST http://localhost:8000/v1/chat/completions \
@@ -68,16 +68,26 @@ curl -X POST http://localhost:8000/v1/chat/completions \
     "model": "proxy-gpt-4",
     "messages": [{"role": "user", "content": "Hello"}]
   }'
+```

-# The proxy will:
-# 1. Route to one of your configured providers
-# 2. Apply model mapping if configured
-# 3. Handle failover automatically
+#### Method 2: Using extra_body (Recommended for SDK usage)
+```bash
+# Use extra_body parameter
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4",
+    "messages": [{"role": "user", "content": "Hello"}],
+    "extra_body": {
+      "optillm_approach": "proxy"
+    }
+  }'
 ```

-> **Known Issues**:
-> - `--approach proxy` flag: Not supported in command-line interface
-> - `extra_body` method: Currently broken due to parsing bug in server code
+Both methods will:
+- Route to one of your configured providers
+- Apply model mapping if configured
+- Handle failover automatically

 #### Combined Approaches
 ```bash
@@ -90,7 +100,20 @@ curl -X POST http://localhost:8000/v1/chat/completions \
   }'
 ```

-> **Note**: The proxy wrapping functionality (`proxy_wrap`) is currently not accessible via the working model prefix method. This would require the `extra_body` approach which is currently broken.
+#### Proxy Wrapping Other Approaches
+```bash
+# Use proxy to wrap MOA approach
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "gpt-4",
+    "messages": [{"role": "user", "content": "Solve this problem"}],
+    "extra_body": {
+      "optillm_approach": "proxy",
+      "proxy_wrap": "moa"
+    }
+  }'
+```

 ## Configuration Reference

@@ -314,11 +337,30 @@ client = OpenAI(
     api_key="dummy" # Can be any string when using proxy
 )

-# Use proxy with model prefix (currently the only working method)
+# Method 1: Use proxy with model prefix
 response = client.chat.completions.create(
     model="proxy-gpt-4", # Use "proxy-" prefix
     messages=[{"role": "user", "content": "Hello"}]
 )
+
+# Method 2: Use extra_body (recommended)
+response = client.chat.completions.create(
+    model="gpt-4",
+    messages=[{"role": "user", "content": "Hello"}],
+    extra_body={
+        "optillm_approach": "proxy"
+    }
+)
+
+# Method 3: Proxy wrapping another approach
+response = client.chat.completions.create(
+    model="gpt-4",
+    messages=[{"role": "user", "content": "Hello"}],
+    extra_body={
+        "optillm_approach": "proxy",
+        "proxy_wrap": "moa"
+    }
+)
 ```

 ### With LangChain

optillm/plugins/proxy/client.py

Lines changed: 12 additions & 3 deletions
@@ -114,6 +114,15 @@ class _Completions:
     def __init__(self, proxy_client):
         self.proxy_client = proxy_client

+    def _filter_kwargs(self, kwargs: dict) -> dict:
+        """Filter out OptiLLM-specific parameters that shouldn't be sent to providers"""
+        optillm_params = {
+            'optillm_approach', 'proxy_wrap', 'wrapped_approach', 'wrap',
+            'mcts_simulations', 'mcts_exploration', 'mcts_depth',
+            'best_of_n', 'rstar_max_depth', 'rstar_num_rollouts', 'rstar_c'
+        }
+        return {k: v for k, v in kwargs.items() if k not in optillm_params}
+
     def create(self, **kwargs):
         """Create completion with load balancing and failover"""
         model = kwargs.get('model', 'unknown')
@@ -145,8 +154,8 @@ def create(self, **kwargs):
             attempted_providers.add(provider)

             try:
-                # Map model name if needed
-                request_kwargs = kwargs.copy()
+                # Map model name if needed and filter out OptiLLM-specific parameters
+                request_kwargs = self._filter_kwargs(kwargs.copy())
                 request_kwargs['model'] = provider.map_model(model)

                 # Track timing
@@ -177,7 +186,7 @@ def create(self, **kwargs):
         if self.proxy_client.fallback_client:
             logger.warning("All proxy providers failed, using fallback client")
             try:
-                return self.proxy_client.fallback_client.chat.completions.create(**kwargs)
+                return self.proxy_client.fallback_client.chat.completions.create(**self._filter_kwargs(kwargs))
             except Exception as e:
                 errors.append(("fallback_client", str(e)))

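The effect of the new `_filter_kwargs` helper can be reproduced in isolation. The request dict below is a hypothetical example and only a subset of the filtered parameter names is shown; the real method lives on `_Completions` as in the diff above.

```python
# Standalone sketch (hypothetical values) of the filtering step added in this commit:
# OptiLLM-only control parameters are dropped before the request reaches a provider.
kwargs = {
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "Hello"}],
    "temperature": 0.7,           # regular OpenAI-compatible parameter, kept
    "optillm_approach": "proxy",  # OptiLLM control parameter, stripped
    "proxy_wrap": "moa",          # OptiLLM control parameter, stripped
}

optillm_params = {"optillm_approach", "proxy_wrap", "wrapped_approach", "wrap"}

# Same dict comprehension as _filter_kwargs
forwarded = {k: v for k, v in kwargs.items() if k not in optillm_params}
print(forwarded)
# {'model': 'gpt-4', 'messages': [{'role': 'user', 'content': 'Hello'}], 'temperature': 0.7}
```
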
optillm/server.py

Lines changed: 4 additions & 2 deletions
@@ -992,12 +992,14 @@ def main():
     global request_batcher
     global conversation_logger
     # Call this function at the start of main()
+
+    # Load plugins first so they're available in argument parser
+    load_plugins()
+
     args = parse_args()
     # Update server_config with all argument values
     server_config.update(vars(args))

-    load_plugins()
-
     port = server_config['port']

     # Initialize request batcher if batch mode is enabled

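The reordering works because argument parsing can only offer plugin-provided options once the plugins have been discovered. A minimal sketch of that dependency, using a hypothetical registry and flag rather than optillm's actual parser setup:

```python
import argparse

loaded_plugins = []  # hypothetical plugin registry filled by load_plugins()

def load_plugins():
    # optillm discovers plugin modules here; this sketch just registers one name
    loaded_plugins.append("proxy")

def parse_args(argv=None):
    parser = argparse.ArgumentParser()
    # the valid --approach choices depend on which plugins were already loaded
    parser.add_argument("--approach", choices=["none"] + loaded_plugins, default="none")
    parser.add_argument("--port", type=int, default=8000)
    return parser.parse_args(argv)

load_plugins()                              # must run first, as in this commit
args = parse_args(["--approach", "proxy"])
print(args.approach)                        # accepted only because the plugin was loaded
```
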
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "optillm"
-version = "0.2.3"
+version = "0.2.4"
 description = "An optimizing inference proxy for LLMs."
 readme = "README.md"
 license = "Apache-2.0"
