diff --git a/optillm/__init__.py b/optillm/__init__.py
index 090f917a..994b643f 100644
--- a/optillm/__init__.py
+++ b/optillm/__init__.py
@@ -1,5 +1,5 @@
 # Version information
-__version__ = "0.2.3"
+__version__ = "0.2.4"
 
 # Import from server module
 from .server import (
diff --git a/optillm/plugins/proxy/README.md b/optillm/plugins/proxy/README.md
index 71424db4..05a55867 100644
--- a/optillm/plugins/proxy/README.md
+++ b/optillm/plugins/proxy/README.md
@@ -51,7 +51,7 @@ routing:
 # Option A: Use proxy as default for ALL requests (recommended)
 optillm --approach proxy
 
-# Option B: Start server normally (requires model prefix or extra_body)
+# Option B: Start server normally (use model prefix or extra_body per request)
 optillm
 
 # With custom port
@@ -60,33 +60,34 @@ optillm --approach proxy --port 8000
 ### 3. Usage Examples
 
-#### When using `--approach proxy` (Recommended)
+#### Method 1: Using --approach proxy (Recommended)
 
 ```bash
-# No need for "proxy-" prefix! The proxy handles all requests automatically
+# Start server with proxy as default approach
+optillm --approach proxy
+
+# Then make normal requests - proxy handles all routing automatically!
 curl -X POST http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
     "model": "gpt-4",
     "messages": [{"role": "user", "content": "Hello"}]
   }'
-
-# The proxy will:
-# 1. Route to one of your configured providers
-# 2. Apply model mapping if configured
-# 3. Handle failover automatically
 ```
 
-#### Without `--approach proxy` flag
+#### Method 2: Using Model Prefix (when server started without --approach proxy)
 
 ```bash
-# Method 1: Use model prefix
+# Use "proxy-" prefix to activate the proxy plugin
 curl -X POST http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
     "model": "proxy-gpt-4",
     "messages": [{"role": "user", "content": "Hello"}]
   }'
+```
-# Method 2: Use extra_body
+#### Method 3: Using extra_body (when server started without --approach proxy)
+```bash
+# Use extra_body parameter
 curl -X POST http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
@@ -98,44 +99,37 @@ curl -X POST http://localhost:8000/v1/chat/completions \
   }'
 ```
 
-#### Proxy with Approach/Plugin
+Both methods will:
+- Route to one of your configured providers
+- Apply model mapping if configured
+- Handle failover automatically
+
+#### Combined Approaches
 ```bash
-# Use MOA approach with proxy load balancing
+# Apply BON sampling, then route through proxy
 curl -X POST http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "gpt-4",
-    "messages": [{"role": "user", "content": "Solve this problem"}],
-    "extra_body": {
-      "optillm_approach": "proxy",
-      "proxy_wrap": "moa"
-    }
+    "model": "bon&proxy-gpt-4",
+    "messages": [{"role": "user", "content": "Generate ideas"}]
   }'
+```
 
-# Use memory plugin with proxy
+#### Proxy Wrapping Other Approaches
+```bash
+# Use proxy to wrap MOA approach
 curl -X POST http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
     "model": "gpt-4",
-    "messages": [{"role": "user", "content": "Remember this"}],
+    "messages": [{"role": "user", "content": "Solve this problem"}],
     "extra_body": {
       "optillm_approach": "proxy",
-      "proxy_wrap": "memory"
+      "proxy_wrap": "moa"
     }
   }'
 ```
 
-#### Combined Approaches
-```bash
-# Apply BON sampling, then route through proxy
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "bon&proxy-gpt-4",
-    "messages": [{"role": "user", "content": "Generate ideas"}]
-  }'
-```
-
 ## Configuration Reference
 
 ### Provider Configuration
@@ -203,7 +197,7 @@ providers:
 
 ### Model-Specific Routing
 
-When using `--approach proxy`, the proxy automatically maps model names to provider-specific deployments:
+The proxy automatically maps model names to provider-specific deployments:
 
 ```yaml
 providers:
@@ -222,9 +216,9 @@ providers:
     # No model_map needed - uses model names as-is
 ```
 
-With this configuration and `optillm --approach proxy`:
-- Request for "gpt-4" → Azure uses "gpt-4-deployment-001", OpenAI uses "gpt-4"
-- Request for "gpt-3.5-turbo" → Azure uses "gpt-35-turbo-deployment", OpenAI uses "gpt-3.5-turbo"
+With this configuration and `proxy-gpt-4` model requests:
+- Request for "proxy-gpt-4" → Azure uses "gpt-4-deployment-001", OpenAI uses "gpt-4"
+- Request for "proxy-gpt-3.5-turbo" → Azure uses "gpt-35-turbo-deployment", OpenAI uses "gpt-3.5-turbo"
 
 ### Failover Configuration
 
@@ -358,19 +352,35 @@ client = OpenAI(
     api_key="dummy"  # Can be any string when using proxy
 )
 
-# If server started with --approach proxy:
+# Method 1: Server started with --approach proxy (recommended)
+# Just make normal requests - proxy handles everything!
+response = client.chat.completions.create(
+    model="gpt-4",
+    messages=[{"role": "user", "content": "Hello"}]
+)
+
+# Method 2: Use proxy with model prefix
 response = client.chat.completions.create(
-    model="gpt-4",  # No "proxy-" prefix needed!
+    model="proxy-gpt-4",  # Use "proxy-" prefix
     messages=[{"role": "user", "content": "Hello"}]
 )
 
-# Or explicitly use proxy with another approach:
+# Method 3: Use extra_body
+response = client.chat.completions.create(
+    model="gpt-4",
+    messages=[{"role": "user", "content": "Hello"}],
+    extra_body={
+        "optillm_approach": "proxy"
+    }
+)
+
+# Method 4: Proxy wrapping another approach
 response = client.chat.completions.create(
     model="gpt-4",
     messages=[{"role": "user", "content": "Hello"}],
     extra_body={
         "optillm_approach": "proxy",
-        "proxy_wrap": "moa"  # Proxy will route MOA's requests
+        "proxy_wrap": "moa"
     }
 )
 ```
@@ -379,12 +389,18 @@ response = client.chat.completions.create(
 
 ```python
 from langchain.llms import OpenAI
 
-# If server started with --approach proxy:
+# If server started with --approach proxy (recommended)
 llm = OpenAI(
     openai_api_base="http://localhost:8000/v1",
     model_name="gpt-4"  # Proxy handles routing automatically
 )
 
+# Or use proxy with model prefix
+llm = OpenAI(
+    openai_api_base="http://localhost:8000/v1",
+    model_name="proxy-gpt-4"  # Use "proxy-" prefix
+)
+
 response = llm("What is the meaning of life?")
 ```
diff --git a/optillm/plugins/proxy/client.py b/optillm/plugins/proxy/client.py
index 3b5a7e0a..2b36d18c 100644
--- a/optillm/plugins/proxy/client.py
+++ b/optillm/plugins/proxy/client.py
@@ -114,6 +114,15 @@ class _Completions:
     def __init__(self, proxy_client):
         self.proxy_client = proxy_client
 
+    def _filter_kwargs(self, kwargs: dict) -> dict:
+        """Filter out OptiLLM-specific parameters that shouldn't be sent to providers"""
+        optillm_params = {
+            'optillm_approach', 'proxy_wrap', 'wrapped_approach', 'wrap',
+            'mcts_simulations', 'mcts_exploration', 'mcts_depth',
+            'best_of_n', 'rstar_max_depth', 'rstar_num_rollouts', 'rstar_c'
+        }
+        return {k: v for k, v in kwargs.items() if k not in optillm_params}
+
     def create(self, **kwargs):
         """Create completion with load balancing and failover"""
         model = kwargs.get('model', 'unknown')
@@ -145,8 +154,8 @@ def create(self, **kwargs):
             attempted_providers.add(provider)
 
             try:
-                # Map model name if needed
-                request_kwargs = kwargs.copy()
+                # Map model name if needed and filter out OptiLLM-specific parameters
+                request_kwargs = self._filter_kwargs(kwargs.copy())
                 request_kwargs['model'] = provider.map_model(model)
 
                 # Track timing
@@ -177,7 +186,7 @@ def create(self, **kwargs):
 
         if self.proxy_client.fallback_client:
             logger.warning("All proxy providers failed, using fallback client")
             try:
-                return self.proxy_client.fallback_client.chat.completions.create(**kwargs)
+                return self.proxy_client.fallback_client.chat.completions.create(**self._filter_kwargs(kwargs))
             except Exception as e:
                 errors.append(("fallback_client", str(e)))
diff --git a/optillm/server.py b/optillm/server.py
index 29271d34..afb99c98 100644
--- a/optillm/server.py
+++ b/optillm/server.py
@@ -992,12 +992,14 @@ def main():
     global request_batcher
     global conversation_logger
     # Call this function at the start of main()
+
+    # Load plugins first so they're available in argument parser
+    load_plugins()
+
     args = parse_args()
 
     # Update server_config with all argument values
     server_config.update(vars(args))
-    load_plugins()
-
     port = server_config['port']
 
     # Initialize request batcher if batch mode is enabled
diff --git a/pyproject.toml b/pyproject.toml
index 74c6479b..8e180219 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "optillm"
-version = "0.2.3"
+version = "0.2.4"
 description = "An optimizing inference proxy for LLMs."
 readme = "README.md"
 license = "Apache-2.0"